diff --git a/CMakeLists.txt b/CMakeLists.txt index 8d96c339dadc7..d874b21b0873d 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License -cmake_minimum_required(VERSION 3.15) +cmake_minimum_required(VERSION 3.10) cmake_policy(VERSION 3.10) set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_CURRENT_SOURCE_DIR}/cmake") set(PADDLE_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}) @@ -22,9 +22,6 @@ include(system) project(paddle CXX C) -include(init) -include(generic) # simplify cmake module - # enable language CUDA # TODO(Shibo Tao): remove find_package(CUDA) completely. find_package(CUDA QUIET) @@ -33,16 +30,24 @@ option(WITH_TENSORRT "Compile PaddlePaddle with NVIDIA TensorRT" OFF) option(WITH_XPU "Compile PaddlePaddle with BAIDU KUNLUN XPU" OFF) option(WITH_WIN_DUMP_DBG "Compile with windows core dump debug mode" OFF) option(WITH_ASCEND "Compile PaddlePaddle with ASCEND" OFF) -# NOTE(zhiqiu): WITH_ASCEND_CL can be compile on x86_64, so we can set WITH_ASCEND=OFF and WITH_ASCEND_CL=ON +option(WITH_ROCM "Compile PaddlePaddle with ROCM platform" OFF) +# NOTE(zhiqiu): WITH_ASCEND_CL can be compile on x86_64, so we can set WITH_ASCEND=OFF and WITH_ASCEND_CL=ON # to develop some acl related functionality on x86 option(WITH_ASCEND_CL "Compile PaddlePaddle with ASCEND CL" ${WITH_ASCEND}) option(WITH_ASCEND_CXX11 "Compile PaddlePaddle with ASCEND and CXX11 ABI" OFF) +# Note(zhouwei): It use option above, so put here +include(init) +include(generic) # simplify cmake module + if (WITH_GPU AND WITH_XPU) message(FATAL_ERROR "Error when compile GPU and XPU at the same time") endif() if (WITH_GPU AND WITH_ASCEND) message(FATAL_ERROR "Error when compile GPU and ASCEND at the same time") endif() +if (WITH_GPU AND WITH_ROCM) + message(FATAL_ERROR "Error when compile CUDA and ROCM at the same time") +endif() if(WITH_GPU AND NOT APPLE) enable_language(CUDA) @@ -61,7 +66,7 @@ if(WITH_MUSL) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=deprecated-declarations -Wno-deprecated-declarations -Wno-error=pessimizing-move -Wno-error=deprecated-copy") endif() -if(WITH_ASCEND AND NOT WITH_ASCEND_CXX11) +if(WITH_ASCEND_CL AND NOT WITH_ASCEND_CXX11) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_GLIBCXX_USE_CXX11_ABI=0") endif() @@ -99,9 +104,11 @@ if(WIN32) endif() endforeach(flag_var) endif() - - # NOTE(Avin0323): Less parallel count result in faster compilation. - math(EXPR PROCESS_MAX "${CPU_CORES} * 2 / 3") + + # NOTE(zhouwei25): temporarily change MP to 1 for reducing CPU & memory utilization + set(PROCESS_MAX 1) + #math(EXPR PROCESS_MAX "${CPU_CORES} * 1 / 2") + # windows build turn off warnings, use parallel compiling. 
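Earlier in this hunk, WITH_ASCEND_CL builds without WITH_ASCEND_CXX11 append -D_GLIBCXX_USE_CXX11_ABI=0, which pins libstdc++ to its pre-C++11 string/list ABI so Paddle can link against CANN libraries built that way. A minimal probe (not part of the patch) showing which ABI a translation unit was compiled with:

```cpp
// abi_probe.cc -- hypothetical helper, not part of this patch.
// Build it once with and once without -D_GLIBCXX_USE_CXX11_ABI=0 to see the
// effect of the flag added for WITH_ASCEND_CL builds.
#include <iostream>
#include <string>

int main() {
#if defined(_GLIBCXX_USE_CXX11_ABI) && _GLIBCXX_USE_CXX11_ABI == 0
  std::cout << "pre-C++11 libstdc++ ABI (what the ASCEND CL build forces)\n";
#else
  std::cout << "C++11 libstdc++ ABI\n";
#endif
  // sizeof(std::string) differs between the two libstdc++ ABIs.
  std::cout << "sizeof(std::string) = " << sizeof(std::string) << "\n";
  return 0;
}
```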
foreach(flag_var CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE @@ -129,6 +136,9 @@ if(WIN32) foreach(flag_var CMAKE_SHARED_LINKER_FLAGS CMAKE_STATIC_LINKER_FLAGS CMAKE_EXE_LINKER_FLAGS CMAKE_LINKER_FLAGS) set(${flag_var} "${${flag_var}} /ignore:4049 /ignore:4217 /ignore:4006 /ignore:4221") + if(MSVC_STATIC_CRT) + set(${flag_var} "${${flag_var}} /NODEFAULTLIB:MSVCRT.LIB") + endif() endforeach(flag_var) if (WITH_WIN_DUMP_DBG) @@ -168,8 +178,6 @@ option(WITH_DISTRIBUTE "Compile with distributed support" OFF) option(WITH_BRPC_RDMA "Use brpc rdma as the rpc protocal" OFF) option(ON_INFER "Turn on inference optimization and inference-lib generation" OFF) ################################ Internal Configurations ####################################### -option(WITH_ROCM "Compile PaddlePaddle with ROCM platform" OFF) -option(WITH_RCCL "Compile PaddlePaddle with RCCL support" OFF) option(WITH_NV_JETSON "Compile PaddlePaddle with NV JETSON" OFF) option(WITH_PROFILER "Compile PaddlePaddle with GPU profiler and gperftools" OFF) option(WITH_COVERAGE "Compile PaddlePaddle with code coverage" OFF) @@ -180,14 +188,15 @@ option(WITH_PSLIB "Compile with pslib support" OFF) option(WITH_BOX_PS "Compile with box_ps support" OFF) option(WITH_XBYAK "Compile with xbyak support" ON) option(WITH_CONTRIB "Compile the third-party contributation" OFF) -option(WITH_GRPC "Use grpc as the default rpc framework" ${WITH_DISTRIBUTE}) option(WITH_PSCORE "Compile with parameter server support" ${WITH_DISTRIBUTE}) +option(WITH_HETERPS "Compile with heterps" OFF) option(WITH_INFERENCE_API_TEST "Test fluid inference C++ high-level api interface" OFF) option(PY_VERSION "Compile PaddlePaddle with python3 support" ${PY_VERSION}) option(WITH_DGC "Use DGC(Deep Gradient Compression) or not" ${WITH_DISTRIBUTE}) option(SANITIZER_TYPE "Choose the type of sanitizer, options are: Address, Leak, Memory, Thread, Undefined" OFF) option(WITH_LITE "Compile Paddle Fluid with Lite Engine" OFF) option(WITH_NCCL "Compile PaddlePaddle with NCCL support" ON) +option(WITH_RCCL "Compile PaddlePaddle with RCCL support" ON) option(WITH_XPU_BKCL "Compile PaddlePaddle with BAIDU KUNLUN XPU BKCL" OFF) option(WITH_CRYPTO "Compile PaddlePaddle with crypto support" ON) option(WITH_ARM "Compile PaddlePaddle with arm support" OFF) @@ -195,6 +204,7 @@ option(WITH_SW "Compile PaddlePaddle with sw support" OFF) option(WITH_MIPS "Compile PaddlePaddle with mips support" OFF) option(WITH_MUSL "Compile with musl libc instead of gblic" OFF) option(WITH_UNITY_BUILD "Compile with UnityBuild mode" OFF) +option(WITH_STRIP "Strip so files of Whl packages" OFF) # PY_VERSION if(NOT PY_VERSION) @@ -255,9 +265,6 @@ endif() if(WITH_BRPC_RDMA) message(STATUS "Use brpc with rdma.") - if(WITH_GRPC) - message(FATAL_ERROR "Can't use grpc with brpc rdma.") - endif() if(NOT WITH_DISTRIBUTE) message(FATAL_ERROR "Can't use brpc rdma in no distribute env.") endif() @@ -305,9 +312,9 @@ endif(WITH_ROCM) if (NOT WITH_ROCM AND WITH_RCCL) MESSAGE(WARNING - "Disable RCCL when compiling without GPU. Force WITH_RCCL=OFF.") - set(WITH_NCCL OFF CACHE STRING - "Disable RCCL when compiling without GPU" FORCE) + "Disable RCCL when compiling without ROCM. Force WITH_RCCL=OFF.") + set(WITH_RCCL OFF CACHE STRING + "Disable RCCL when compiling without ROCM" FORCE) endif() if(WITH_RCCL) @@ -362,6 +369,13 @@ else() message(WARNING "On inference mode, will take place some specific optimization.
Turn on the ON_INFER flag when building inference_lib only.") endif() +if(WITH_STRIP) + find_program(STRIP_PATH strip) + if(NOT STRIP_PATH OR NOT LINUX) + set(WITH_STRIP OFF CACHE STRING "Command strip is only used on Linux when it exists." FORCE) + endif() +endif() + add_subdirectory(paddle) if(WITH_PYTHON) add_subdirectory(python) diff --git a/cmake/configure.cmake b/cmake/configure.cmake index 9f1eb16fcf03f..e7f125269be1f 100644 --- a/cmake/configure.cmake +++ b/cmake/configure.cmake @@ -173,10 +173,9 @@ if(WITH_PSCORE) add_definitions(-DPADDLE_WITH_PSCORE) endif() - -if(WITH_GRPC) - add_definitions(-DPADDLE_WITH_GRPC) -endif(WITH_GRPC) +if(WITH_HETERPS) + add_definitions(-DPADDLE_WITH_HETERPS) +endif() if(WITH_BRPC_RDMA) add_definitions(-DPADDLE_WITH_BRPC_RDMA) diff --git a/cmake/external/ascend.cmake b/cmake/external/ascend.cmake index bddd2023b437b..414b2a54be034 100644 --- a/cmake/external/ascend.cmake +++ b/cmake/external/ascend.cmake @@ -21,7 +21,13 @@ else() set(ASCEND_DIR /usr/local/Ascend) endif() -if(WITH_ASCEND) +if(EXISTS ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/include/graph/ascend_string.h) + # It means CANN 20.2 + + add_definitions(-DPADDLE_WITH_ASCEND_STRING) +endif() + + +if(WITH_ASCEND OR WITH_ASCEND_CL) set(ASCEND_DRIVER_DIR ${ASCEND_DIR}/driver/lib64) set(ASCEND_DRIVER_COMMON_DIR ${ASCEND_DIR}/driver/lib64/common) set(ASCEND_DRIVER_SHARE_DIR ${ASCEND_DIR}/driver/lib64/share) @@ -43,9 +49,6 @@ if(WITH_ASCEND) set(atlas_acl_lib ${ATLAS_RUNTIME_DIR}/libascendcl.so) INCLUDE_DIRECTORIES(${ATLAS_RUNTIME_INC_DIR}) - if(EXISTS ${ATLAS_RUNTIME_INC_DIR}/graph/ascend_string.h) - add_definitions(-DPADDLE_WITH_ASCEND_STRING) - endif() ADD_LIBRARY(ascend_ge SHARED IMPORTED GLOBAL) SET_PROPERTY(TARGET ascend_ge PROPERTY IMPORTED_LOCATION ${atlas_ge_runner_lib}) @@ -62,17 +65,23 @@ endif() if(WITH_ASCEND_CL) set(ASCEND_CL_DIR ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/lib64) + set(ascend_hccl_lib ${ASCEND_CL_DIR}/libhccl.so) set(ascendcl_lib ${ASCEND_CL_DIR}/libascendcl.so) set(acl_op_compiler_lib ${ASCEND_CL_DIR}/libacl_op_compiler.so) - set(ASCEND_CL_INC_DIR ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/include) + set(FWKACLLIB_INC_DIR ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/include) + set(ACLLIB_INC_DIR ${ASCEND_DIR}/ascend-toolkit/latest/acllib/include) - message(STATUS "ASCEND_CL_INC_DIR ${ASCEND_CL_INC_DIR}") + message(STATUS "FWKACLLIB_INC_DIR ${FWKACLLIB_INC_DIR}") message(STATUS "ASCEND_CL_DIR ${ASCEND_CL_DIR}") - INCLUDE_DIRECTORIES(${ASCEND_CL_INC_DIR}) + INCLUDE_DIRECTORIES(${FWKACLLIB_INC_DIR}) + INCLUDE_DIRECTORIES(${ACLLIB_INC_DIR}) ADD_LIBRARY(ascendcl SHARED IMPORTED GLOBAL) SET_PROPERTY(TARGET ascendcl PROPERTY IMPORTED_LOCATION ${ascendcl_lib}) + ADD_LIBRARY(ascend_hccl SHARED IMPORTED GLOBAL) + SET_PROPERTY(TARGET ascend_hccl PROPERTY IMPORTED_LOCATION ${ascend_hccl_lib}) + ADD_LIBRARY(acl_op_compiler SHARED IMPORTED GLOBAL) SET_PROPERTY(TARGET acl_op_compiler PROPERTY IMPORTED_LOCATION ${acl_op_compiler_lib}) add_custom_target(extern_ascend_cl DEPENDS ascendcl acl_op_compiler) diff --git a/cmake/external/gloo.cmake b/cmake/external/gloo.cmake index 2e4a67093dc54..e8db13a694f55 100644 --- a/cmake/external/gloo.cmake +++ b/cmake/external/gloo.cmake @@ -32,7 +32,7 @@ cache_third_party(extern_gloo TAG ${GLOO_TAG} DIR GLOO_SOURCE_DIR) -if(WITH_ASCEND) + if(WITH_ASCEND OR WITH_ASCEND_CL) ExternalProject_Add( extern_gloo ${EXTERNAL_PROJECT_LOG_ARGS} diff --git a/cmake/external/grpc.cmake b/cmake/external/grpc.cmake deleted file mode 100644 index 
536e95c1dc2a4..0000000000000 --- a/cmake/external/grpc.cmake +++ /dev/null @@ -1,77 +0,0 @@ -# Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -include (ExternalProject) - -SET(GRPC_SOURCES_DIR ${THIRD_PARTY_PATH}/grpc) -SET(GRPC_INSTALL_DIR ${THIRD_PARTY_PATH}/install/grpc) -SET(GRPC_INCLUDE_DIR "${GRPC_INSTALL_DIR}/include/" CACHE PATH "grpc include directory." FORCE) -SET(GRPC_CPP_PLUGIN "${GRPC_INSTALL_DIR}/bin/grpc_cpp_plugin" CACHE FILEPATH "GRPC_CPP_PLUGIN" FORCE) - -include(ProcessorCount) -ProcessorCount(NUM_OF_PROCESSOR) - -IF(APPLE) - SET(BUILD_CMD make -n HAS_SYSTEM_PROTOBUF=false -s -j ${NUM_OF_PROCESSOR} static grpc_cpp_plugin | sed "s/-Werror//g" | sh) - SET(GRPC_INSTALL_CMD make prefix=${GRPC_INSTALL_DIR} install) -ELSE() - SET(GRPC_CFLAGS "-Wno-error -std=c11 ${CLFAGS}") - SET(GRPC_CXXFLAGS "-Wno-error -std=c++11 ${CXXFLAGS}") - SET(BUILD_CMD make CFLAGS=${GRPC_CFLAGS} CXXFLAGS=${GRPC_CXXFLAGS} HAS_SYSTEM_PROTOBUF=false -s -j ${NUM_OF_PROCESSOR} static grpc_cpp_plugin) - SET(GRPC_INSTALL_CMD make prefix=${GRPC_INSTALL_DIR} install CFLAGS=${GRPC_CFLAGS} CXXFLAGS=${GRPC_CXXFLAGS}) -ENDIF() - -# FIXME(wuyi): do not build zlib cares protobuf twice, find a way to build grpc with them -ExternalProject_Add( - extern_grpc - DEPENDS protobuf zlib - # NOTE(wuyi): - # this package is generated by following steps: - # 1. git clone -b v1.8.x https://github.com/grpc/grpc.git - # 2. git submodule update --init - # 3. keep only zlib, cares, protobuf, boringssl under "third_party", - # checkout and clean other dirs under third_party - # 4. remove .git, and package the directory. - URL http://paddlepaddledeps.bj.bcebos.com/grpc-v1.10.x_paddle.tar.gz - URL_MD5 f5442d137ddccee252e194b1bc90f98c - PREFIX ${GRPC_SOURCES_DIR} - UPDATE_COMMAND "" - CONFIGURE_COMMAND "" - BUILD_IN_SOURCE 1 - # NOTE(yuyang18): - # Disable -Werror, otherwise the compile will fail in MacOS. - # It seems that we cannot configure that by make command. 
- # Just dry run make command and remove `-Werror`, then use a shell to run make commands - BUILD_COMMAND ${BUILD_CMD} - INSTALL_COMMAND ${GRPC_INSTALL_CMD} -) - -ADD_LIBRARY(grpc++_unsecure STATIC IMPORTED GLOBAL) -SET_PROPERTY(TARGET grpc++_unsecure PROPERTY IMPORTED_LOCATION - "${GRPC_INSTALL_DIR}/lib/libgrpc++_unsecure.a") - -ADD_LIBRARY(grpc++ STATIC IMPORTED GLOBAL) -SET_PROPERTY(TARGET grpc++ PROPERTY IMPORTED_LOCATION - "${GRPC_INSTALL_DIR}/lib/libgrpc++.a") -ADD_LIBRARY(gpr STATIC IMPORTED GLOBAL) -SET_PROPERTY(TARGET gpr PROPERTY IMPORTED_LOCATION - "${GRPC_INSTALL_DIR}/lib/libgpr.a") - -ADD_LIBRARY(grpc_unsecure STATIC IMPORTED GLOBAL) -SET_PROPERTY(TARGET grpc_unsecure PROPERTY IMPORTED_LOCATION - "${GRPC_INSTALL_DIR}/lib/libgrpc_unsecure.a") - -include_directories(${GRPC_INCLUDE_DIR}) -ADD_DEPENDENCIES(grpc++_unsecure extern_grpc) diff --git a/cmake/external/mkldnn.cmake b/cmake/external/mkldnn.cmake index 884219d8dd81f..fb1d4d9d56dcc 100644 --- a/cmake/external/mkldnn.cmake +++ b/cmake/external/mkldnn.cmake @@ -20,7 +20,7 @@ SET(MKLDNN_SOURCE_DIR ${THIRD_PARTY_PATH}/mkldnn/src/extern_mkldnn) SET(MKLDNN_INSTALL_DIR ${THIRD_PARTY_PATH}/install/mkldnn) SET(MKLDNN_INC_DIR "${MKLDNN_INSTALL_DIR}/include" CACHE PATH "mkldnn include directory." FORCE) SET(MKLDNN_REPOSITORY ${GIT_URL}/oneapi-src/oneDNN.git) -SET(MKLDNN_TAG 72efa005effb49595933e033cc732f215ef0445a) +SET(MKLDNN_TAG f58682cd8bd0615f41d879f8afc8f1511ab42d24) # Introduce variables: # * CMAKE_INSTALL_LIBDIR diff --git a/cmake/external/protobuf.cmake b/cmake/external/protobuf.cmake index 82d64fd022883..c108c05368c91 100644 --- a/cmake/external/protobuf.cmake +++ b/cmake/external/protobuf.cmake @@ -242,7 +242,7 @@ endif() ) ENDFUNCTION() -if(WITH_ASCEND) +if(WITH_ASCEND OR WITH_ASCEND_CL) SET(PROTOBUF_VERSION 3.8.0) else() SET(PROTOBUF_VERSION 3.1.0) diff --git a/cmake/external/threadpool.cmake b/cmake/external/threadpool.cmake index 0eabdb4e127bd..f9cb3a9075a82 100644 --- a/cmake/external/threadpool.cmake +++ b/cmake/external/threadpool.cmake @@ -16,7 +16,7 @@ INCLUDE(ExternalProject) SET(THREADPOOL_PREFIX_DIR ${THIRD_PARTY_PATH}/threadpool) SET(THREADPOOL_SOURCE_DIR ${THIRD_PARTY_PATH}/threadpool/src/extern_threadpool) -if(WITH_ASCEND) +if(WITH_ASCEND OR WITH_ASCEND_CL) SET(THREADPOOL_REPOSITORY https://gitee.com/tianjianhe/ThreadPool.git) else() SET(THREADPOOL_REPOSITORY ${GIT_URL}/progschj/ThreadPool.git) diff --git a/cmake/external/warpctc.cmake b/cmake/external/warpctc.cmake index a4367510ac703..100b915339469 100644 --- a/cmake/external/warpctc.cmake +++ b/cmake/external/warpctc.cmake @@ -43,7 +43,7 @@ cache_third_party(extern_warpctc TAG ${WARPCTC_TAG} DIR WARPCTC_SOURCE_DIR) -if(WITH_ASCEND) +if(WITH_ASCEND OR WITH_ASCEND_CL) ExternalProject_Add( extern_warpctc ${EXTERNAL_PROJECT_LOG_ARGS} diff --git a/cmake/generic.cmake b/cmake/generic.cmake index c85654a5674a0..a5c74a46631e9 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -447,9 +447,20 @@ function(cc_test TARGET_NAME) cc_test_build(${TARGET_NAME} SRCS ${cc_test_SRCS} DEPS ${cc_test_DEPS}) - cc_test_run(${TARGET_NAME} - COMMAND ${TARGET_NAME} - ARGS ${cc_test_ARGS}) + # we dont test hcom op, because it need complex configuration + # with more than one machine + if(NOT ("${TARGET_NAME}" STREQUAL "c_broadcast_op_npu_test" OR + "${TARGET_NAME}" STREQUAL "c_allreduce_sum_op_npu_test" OR + "${TARGET_NAME}" STREQUAL "c_allreduce_max_op_npu_test" OR + "${TARGET_NAME}" STREQUAL "c_reducescatter_op_npu_test" OR + "${TARGET_NAME}" STREQUAL 
"c_allgather_op_npu_test" OR + "${TARGET_NAME}" STREQUAL "send_v2_op_npu_test" OR + "${TARGET_NAME}" STREQUAL "c_reduce_sum_op_npu_test" OR + "${TARGET_NAME}" STREQUAL "recv_v2_op_npu_test")) + cc_test_run(${TARGET_NAME} + COMMAND ${TARGET_NAME} + ARGS ${cc_test_ARGS}) + endif() endif() endfunction(cc_test) @@ -807,7 +818,7 @@ function(py_test TARGET_NAME) ${PYTHON_EXECUTABLE} -u ${py_test_SRCS} ${py_test_ARGS} WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) endif() - + if (WIN32) set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 150) endif() diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake index 4864e04fa0516..9694a7bc59c12 100644 --- a/cmake/inference_lib.cmake +++ b/cmake/inference_lib.cmake @@ -211,11 +211,11 @@ set(src_dir "${PADDLE_SOURCE_DIR}/paddle/fluid") if(WIN32) set(paddle_inference_c_lib $/paddle_inference_c.*) else(WIN32) - set(paddle_inference_c_lib ${PADDLE_BINARY_DIR}/paddle/fluid/inference/capi/libpaddle_inference_c.*) + set(paddle_inference_c_lib ${PADDLE_BINARY_DIR}/paddle/fluid/inference/capi_exp/libpaddle_inference_c.*) endif(WIN32) copy(inference_lib_dist - SRCS ${src_dir}/inference/capi/paddle_c_api.h ${paddle_inference_c_lib} + SRCS ${src_dir}/inference/capi_exp/pd_*.h ${paddle_inference_c_lib} DSTS ${PADDLE_INFERENCE_C_INSTALL_DIR}/paddle/include ${PADDLE_INFERENCE_C_INSTALL_DIR}/paddle/lib) # fluid library for both train and inference diff --git a/cmake/init.cmake b/cmake/init.cmake index 19fdb6c601a11..b11156d2e9986 100644 --- a/cmake/init.cmake +++ b/cmake/init.cmake @@ -18,10 +18,10 @@ if(NOT WIN32) set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O2 -g -DNDEBUG") set(CMAKE_CXX_FLAGS_MINSIZEREL "-Os -DNDEBUG") else() - # It has not been used now, it can specify CUDA compile flag manualy, + # It can specify CUDA compile flag manualy, # its use is to remvoe /Zi to reduce GPU static library size. But it's dangerous # because CUDA will update by nvidia, then error will occur. - # Now, it's used in CUDA:[10.0, 10.2] + # Now, it's only used in VS2015 + CUDA:[10.0, 10.2] set(WIN_PROPS ${CMAKE_SOURCE_DIR}/cmake/paddle_win.props) endif() diff --git a/cmake/paddle_win.props b/cmake/paddle_win.props index 296940dc3f50c..3c069bd2981c4 100644 --- a/cmake/paddle_win.props +++ b/cmake/paddle_win.props @@ -88,4 +88,3 @@ set CUDAFE_FLAGS=--sdk_dir "$(WindowsSdkDir)" - diff --git a/cmake/third_party.cmake b/cmake/third_party.cmake index 81fa7d0dfa98f..f90fa3509d63d 100644 --- a/cmake/third_party.cmake +++ b/cmake/third_party.cmake @@ -29,9 +29,9 @@ set(third_party_deps) # 2. REPOSITORY: specify git REPOSITORY of 3rd party # 3. TAG: specify git tag/branch/commitID of 3rd party # 4. DIR: overwrite the original SOURCE_DIR when cache directory -# +# # The function Return 1 PARENT_SCOPE variables: -# - ${TARGET}_DOWNLOAD_CMD: Simply place "${TARGET}_DOWNLOAD_CMD" in ExternalProject_Add, +# - ${TARGET}_DOWNLOAD_CMD: Simply place "${TARGET}_DOWNLOAD_CMD" in ExternalProject_Add, # and you no longer need to set any donwnload steps in ExternalProject_Add. 
# For example: # Cache_third_party(${TARGET} @@ -52,7 +52,7 @@ FUNCTION(cache_third_party TARGET) SET(${TARGET_NAME}_DOWNLOAD_CMD GIT_REPOSITORY ${cache_third_party_REPOSITORY}) IF(cache_third_party_TAG) - LIST(APPEND ${TARGET_NAME}_DOWNLOAD_CMD + LIST(APPEND ${TARGET_NAME}_DOWNLOAD_CMD GIT_TAG ${cache_third_party_TAG}) ENDIF() ELSEIF(cache_third_party_URL) @@ -130,7 +130,7 @@ ENDFUNCTION() # Correction of flags on different Platform(WIN/MAC) and Print Warning Message if (APPLE) if(WITH_MKL) - MESSAGE(WARNING + MESSAGE(WARNING "Mac is not supported with MKL in Paddle yet. Force WITH_MKL=OFF.") set(WITH_MKL OFF CACHE STRING "Disable MKL for building on mac" FORCE) endif() @@ -141,7 +141,7 @@ if(WIN32 OR APPLE) SET(WITH_XBYAK OFF CACHE STRING "Disable XBYAK in Windows and MacOS" FORCE) if(WITH_LIBXSMM) - MESSAGE(WARNING + MESSAGE(WARNING "Windows, Mac are not supported with libxsmm in Paddle yet." "Force WITH_LIBXSMM=OFF") SET(WITH_LIBXSMM OFF CACHE STRING "Disable LIBXSMM in Windows and MacOS" FORCE) @@ -276,7 +276,7 @@ endif(WITH_BOX_PS) if(WITH_ASCEND OR WITH_ASCEND_CL) include(external/ascend) - if(WITH_ASCEND) + if(WITH_ASCEND OR WITH_ASCEND_CL) list(APPEND third_party_deps extern_ascend) endif() if(WITH_ASCEND_CL) @@ -290,7 +290,7 @@ if (WITH_PSCORE) include(external/leveldb) list(APPEND third_party_deps extern_leveldb) - + include(external/brpc) list(APPEND third_party_deps extern_brpc) diff --git a/go/demo/mobilenet_c_exp.cc b/go/demo/mobilenet_c_exp.cc new file mode 100644 index 0000000000000..b4f42dab6790b --- /dev/null +++ b/go/demo/mobilenet_c_exp.cc @@ -0,0 +1,84 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
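The new demo that follows exercises the capi_exp interface end to end and releases every handle with an explicit *Destroy call. C++ callers can tie the same handles to std::unique_ptr with custom deleters so they are released on every exit path; a small sketch, assuming pd_inference_api.h is the umbrella header among the pd_*.h files that inference_lib.cmake now installs:

```cpp
// Sketch only: RAII wrappers over the capi_exp handles used by the demo.
// "pd_inference_api.h" is an assumption about the umbrella header name.
#include <memory>
#include "pd_inference_api.h"

using PredictorPtr =
    std::unique_ptr<PD_Predictor, decltype(&PD_PredictorDestroy)>;
using TensorPtr = std::unique_ptr<PD_Tensor, decltype(&PD_TensorDestroy)>;

// PD_PredictorCreate takes ownership of the config, as the demo notes.
inline PredictorPtr MakePredictor(PD_Config* config) {
  return PredictorPtr(PD_PredictorCreate(config), &PD_PredictorDestroy);
}
```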
+#include +#include +#include + +void ReadData(float* data, int size); + +int main(int argc, char* argv[]) { + PD_Config* config = PD_ConfigCreate(); + PD_ConfigSetModel(config, "data/model/__model__", "data/model/__params__"); + PD_ConfigDisableGlogInfo(config); + + PD_Predictor* predictor = PD_PredictorCreate(config); + // config has destroyed in PD_PredictorCreate + config = NULL; + + int input_num = PD_PredictorGetInputNum(predictor); + printf("Input num: %d\n", input_num); + int output_num = PD_PredictorGetOutputNum(predictor); + printf("Output num: %d\n", output_num); + + PD_OneDimArrayCstr* input_names = PD_PredictorGetInputNames(predictor); + PD_Tensor* input_tensor = + PD_PredictorGetInputHandle(predictor, input_names->data[0]); + PD_OneDimArrayCstrDestroy(input_names); + input_names = NULL; + + int32_t shape[] = {1, 3, 300, 300}; + float* data = (float*)malloc(sizeof(float) * 1 * 3 * 300 * 300); // NOLINT + ReadData(data, 1 * 3 * 300 * 300); // NOLINT + PD_TensorReshape(input_tensor, 4, shape); + PD_TensorCopyFromCpuFloat(input_tensor, data); + free(data); + data = NULL; + PD_PredictorRun(predictor); + + PD_OneDimArrayCstr* output_names = PD_PredictorGetOutputNames(predictor); + PD_Tensor* output_tensor = + PD_PredictorGetOutputHandle(predictor, output_names->data[0]); + PD_OneDimArrayCstrDestroy(output_names); + output_names = nullptr; + + PD_OneDimArrayInt32* out_shape = PD_TensorGetShape(output_tensor); + int32_t size = 1; + for (size_t index = 0; index < out_shape->size; ++index) { + size = size * out_shape->data[index]; + } + PD_OneDimArrayInt32Destroy(out_shape); + out_shape = NULL; + + data = (float*)malloc(sizeof(float) * size); // NOLINT + PD_TensorCopyToCpuFloat(output_tensor, data); + free(data); + data = NULL; + + PD_TensorDestroy(output_tensor); + output_tensor = NULL; + PD_TensorDestroy(input_tensor); + input_tensor = NULL; + PD_PredictorDestroy(predictor); + predictor = NULL; + + return 0; +} + +void ReadData(float* data, int n) { + FILE* fp = fopen("data/data.txt", "r"); + for (int i = 0; i < n; i++) { + fscanf(fp, "%f", &data[i]); + } + fclose(fp); +} diff --git a/paddle/extension.h b/paddle/extension.h index 71469576853a3..98d4bfd0326c5 100644 --- a/paddle/extension.h +++ b/paddle/extension.h @@ -15,4 +15,4 @@ limitations under the License. 
*/ #pragma once // All paddle apis in C++ frontend -#include "paddle/fluid/extension/include/ext_all.h" +#include "paddle/extension/include/ext_all.h" diff --git a/paddle/fluid/distributed/CMakeLists.txt b/paddle/fluid/distributed/CMakeLists.txt index 5a2d7a06201ba..a2062d82c8130 100644 --- a/paddle/fluid/distributed/CMakeLists.txt +++ b/paddle/fluid/distributed/CMakeLists.txt @@ -14,6 +14,7 @@ endif() add_subdirectory(table) add_subdirectory(service) add_subdirectory(test) +add_subdirectory(index_dataset) get_property(RPC_DEPS GLOBAL PROPERTY RPC_DEPS) diff --git a/paddle/fluid/distributed/fleet.cc b/paddle/fluid/distributed/fleet.cc index 9aafdd769ed4a..dfd55f16e1a06 100644 --- a/paddle/fluid/distributed/fleet.cc +++ b/paddle/fluid/distributed/fleet.cc @@ -146,6 +146,44 @@ void FleetWrapper::CreateClient2ClientConnection() { client2client_max_retry_); } +std::future FleetWrapper::PullSparseVarsAsync( + const Scope& scope, const uint64_t table_id, + const std::vector& var_names, std::vector* fea_keys, + std::vector>* fea_values, int fea_value_dim) { + fea_keys->clear(); + fea_keys->resize(0); + fea_keys->reserve(MAX_FEASIGN_NUM); + for (auto name : var_names) { + Variable* var = scope.FindVar(name); + if (var == nullptr) { + continue; + } + LoDTensor* tensor = var->GetMutable(); + CHECK(tensor != nullptr) << "tensor of var " << name << " is null"; + int64_t* ids = tensor->data(); + size_t len = tensor->numel(); + for (auto i = 0u; i < len; ++i) { + if (ids[i] == 0u) { + continue; + } + fea_keys->push_back(static_cast(ids[i])); + } + } + fea_values->resize(fea_keys->size() + 1); + for (auto& t : *fea_values) { + t.resize(fea_value_dim); + } + std::vector pull_result_ptr; + for (auto& t : *fea_values) { + pull_result_ptr.push_back(t.data()); + } + + bool training = true; + return pserver_ptr_->_worker_ptr->pull_sparse(pull_result_ptr.data(), + table_id, fea_keys->data(), + fea_keys->size(), training); +} + void FleetWrapper::PullSparseVarsSync( const Scope& scope, const uint64_t table_id, const std::vector& var_names, std::vector* fea_keys, diff --git a/paddle/fluid/distributed/fleet.h b/paddle/fluid/distributed/fleet.h index 863440180a808..0da5d1e2bf987 100644 --- a/paddle/fluid/distributed/fleet.h +++ b/paddle/fluid/distributed/fleet.h @@ -84,6 +84,15 @@ class FleetWrapper { int fea_dim, const std::vector& var_emb_names); + // Pull sparse variables from server in async mode + // Param: scope, table_id, var_names, fea_keys, fea_dim + // Param: fea_values std::future + std::future PullSparseVarsAsync( + const Scope& scope, const uint64_t table_id, + const std::vector& var_names, + std::vector* fea_keys, + std::vector>* fea_values, int fea_dim); + // Pull sparse variables from server in sync mode // pull immediately to tensors // is_training is true means training, false means inference, the behavior is diff --git a/paddle/fluid/distributed/index_dataset/CMakeLists.txt b/paddle/fluid/distributed/index_dataset/CMakeLists.txt new file mode 100644 index 0000000000000..a30488494a52b --- /dev/null +++ b/paddle/fluid/distributed/index_dataset/CMakeLists.txt @@ -0,0 +1,7 @@ +proto_library(index_dataset_proto SRCS index_dataset.proto) +cc_library(index_wrapper SRCS index_wrapper.cc DEPS index_dataset_proto fs) +cc_library(index_sampler SRCS index_sampler.cc DEPS index_wrapper) + +if(WITH_PYTHON) + py_proto_compile(index_dataset_py_proto SRCS index_dataset.proto) +endif() diff --git a/paddle/fluid/operators/distributed/async_sparse_param_update_recorder.cc 
b/paddle/fluid/distributed/index_dataset/index_dataset.proto similarity index 55% rename from paddle/fluid/operators/distributed/async_sparse_param_update_recorder.cc rename to paddle/fluid/distributed/index_dataset/index_dataset.proto index 3f3b6b959e301..1b4ee313671ad 100644 --- a/paddle/fluid/operators/distributed/async_sparse_param_update_recorder.cc +++ b/paddle/fluid/distributed/index_dataset/index_dataset.proto @@ -1,4 +1,4 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -12,16 +12,21 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/operators/distributed/async_sparse_param_update_recorder.h" +syntax = "proto2"; +package paddle.distributed; -namespace paddle { -namespace operators { -namespace distributed { +message IndexNode { + required uint64 id = 1; + required bool is_leaf = 2; + required float probability = 3; +} -std::once_flag AsyncSparseParamUpdateRecorder::init_flag_; -std::unique_ptr - AsyncSparseParamUpdateRecorder::recorder_(nullptr); +message TreeMeta { + required int32 height = 1; + required int32 branch = 2; +} -} // namespace distributed -} // namespace operators -} // namespace paddle +message KVItem { + required bytes key = 1; + required bytes value = 2; +} \ No newline at end of file diff --git a/paddle/fluid/distributed/index_dataset/index_sampler.cc b/paddle/fluid/distributed/index_dataset/index_sampler.cc new file mode 100644 index 0000000000000..58f85d98fb09c --- /dev/null +++ b/paddle/fluid/distributed/index_dataset/index_sampler.cc @@ -0,0 +1,95 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
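The index_dataset.proto messages above describe the on-disk records of a tree index: each record is a KVItem whose value holds either the TreeMeta (under the key .tree_meta) or a serialized IndexNode keyed by its code, and, judging from TreeIndex::Load later in this patch, records are framed by a native int length prefix. A hedged writer sketch for such a file; the file name, node values, and the generated header path are assumptions:

```cpp
// tree_writer.cc -- illustrative only; the framing (int length + KVItem
// bytes) is inferred from TreeIndex::Load in this patch.
#include <cstdio>
#include <string>
#include "paddle/fluid/distributed/index_dataset/index_dataset.pb.h"

void AppendItem(FILE* fp, const std::string& key, const std::string& value) {
  paddle::distributed::KVItem item;
  item.set_key(key);
  item.set_value(value);
  std::string buf;
  item.SerializeToString(&buf);
  int num = static_cast<int>(buf.size());
  fwrite(&num, sizeof(num), 1, fp);        // length prefix read by Load()
  fwrite(buf.data(), 1, buf.size(), fp);   // serialized KVItem payload
}

int main() {
  FILE* fp = fopen("demo.tree", "wb");
  paddle::distributed::TreeMeta meta;
  meta.set_height(3);
  meta.set_branch(2);
  AppendItem(fp, ".tree_meta", meta.SerializeAsString());

  paddle::distributed::IndexNode node;
  node.set_id(42);           // item id, must be non-zero per Load()
  node.set_is_leaf(true);
  node.set_probability(1.0f);
  AppendItem(fp, "6", node.SerializeAsString());  // key is the node code
  fclose(fp);
  return 0;
}
```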
+ +#include "paddle/fluid/distributed/index_dataset/index_sampler.h" +#include "paddle/fluid/operators/math/sampler.h" + +namespace paddle { +namespace distributed { + +using Sampler = paddle::operators::math::Sampler; + +std::vector> LayerWiseSampler::sample( + const std::vector>& user_inputs, + const std::vector& target_ids, bool with_hierarchy) { + auto input_num = target_ids.size(); + auto user_feature_num = user_inputs[0].size(); + std::vector> outputs( + input_num * layer_counts_sum_, + std::vector(user_feature_num + 2)); + + auto max_layer = tree_->Height(); + std::vector sampler_vec(max_layer - start_sample_layer_); + std::vector> layer_ids(max_layer - + start_sample_layer_); + + auto layer_index = max_layer - 1; + size_t idx = 0; + while (layer_index >= start_sample_layer_) { + auto layer_codes = tree_->GetLayerCodes(layer_index); + layer_ids[idx] = tree_->GetNodes(layer_codes); + sampler_vec[idx] = new paddle::operators::math::UniformSampler( + layer_ids[idx].size() - 1, seed_); + layer_index--; + idx++; + } + + idx = 0; + for (size_t i = 0; i < input_num; i++) { + auto travel_codes = + tree_->GetTravelCodes(target_ids[i], start_sample_layer_); + auto travel_path = tree_->GetNodes(travel_codes); + for (size_t j = 0; j < travel_path.size(); j++) { + // user + if (j > 0 && with_hierarchy) { + auto ancestor_codes = + tree_->GetAncestorCodes(user_inputs[i], max_layer - j - 1); + auto hierarchical_user = tree_->GetNodes(ancestor_codes); + for (int idx_offset = 0; idx_offset <= layer_counts_[j]; idx_offset++) { + for (size_t k = 0; k < user_feature_num; k++) { + outputs[idx + idx_offset][k] = hierarchical_user[k].id(); + } + } + } else { + for (int idx_offset = 0; idx_offset <= layer_counts_[j]; idx_offset++) { + for (size_t k = 0; k < user_feature_num; k++) { + outputs[idx + idx_offset][k] = user_inputs[i][k]; + } + } + } + + // sampler ++ + outputs[idx][user_feature_num] = travel_path[j].id(); + outputs[idx][user_feature_num + 1] = 1.0; + idx += 1; + for (int idx_offset = 0; idx_offset < layer_counts_[j]; idx_offset++) { + int sample_res = 0; + do { + sample_res = sampler_vec[j]->Sample(); + } while (layer_ids[j][sample_res].id() == travel_path[j].id()); + outputs[idx + idx_offset][user_feature_num] = + layer_ids[j][sample_res].id(); + outputs[idx + idx_offset][user_feature_num + 1] = 0; + } + idx += layer_counts_[j]; + } + } + for (size_t i = 0; i < sampler_vec.size(); i++) { + delete sampler_vec[i]; + } + return outputs; +} + +} // end namespace distributed +} // end namespace paddle diff --git a/paddle/fluid/distributed/index_dataset/index_sampler.h b/paddle/fluid/distributed/index_dataset/index_sampler.h new file mode 100644 index 0000000000000..66882bedc9b76 --- /dev/null +++ b/paddle/fluid/distributed/index_dataset/index_sampler.h @@ -0,0 +1,100 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
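LayerWiseSampler::sample above emits, for each target, one positive row per sampled layer followed by layer_counts_[j] negatives drawn uniformly from the same layer, each row laid out as [user feature ids..., node id, label]. A usage sketch under the assumption that a tree file like the one written above has been prepared (tree name and feature values are placeholders):

```cpp
// sampler_demo.cc -- illustrative usage of the classes added in this patch.
#include <cstdint>
#include <iostream>
#include <vector>
#include "paddle/fluid/distributed/index_dataset/index_sampler.h"
#include "paddle/fluid/distributed/index_dataset/index_wrapper.h"

int main() {
  namespace dist = paddle::distributed;
  // Register the tree once; get_tree_index() enforces this ordering.
  dist::IndexWrapper::GetInstance()->insert_tree_index("demo", "demo.tree");

  auto sampler = dist::IndexSampler::Init<dist::LayerWiseSampler>("demo");
  // One negative per layer, starting the walk from layer 1.
  sampler->init_layerwise_conf({1, 1, 1}, /*start_sample_layer=*/1, /*seed=*/7);

  std::vector<std::vector<uint64_t>> user_inputs = {{1001, 1002}};
  std::vector<uint64_t> target_ids = {42};
  auto rows = sampler->sample(user_inputs, target_ids, /*with_hierarchy=*/false);

  for (const auto& row : rows) {
    // node id sits before the last column, the label is the last column
    std::cout << "node=" << row[row.size() - 2]
              << " label=" << row[row.size() - 1] << "\n";
  }
  return 0;
}
```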
+ +#pragma once +#include +#include "paddle/fluid/distributed/index_dataset/index_wrapper.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace distributed { + +class IndexSampler { + public: + virtual ~IndexSampler() {} + IndexSampler() {} + + template + static std::shared_ptr Init(const std::string& name) { + std::shared_ptr instance = nullptr; + instance.reset(new T(name)); + return instance; + } + + virtual void init_layerwise_conf(const std::vector& layer_sample_counts, + int start_sample_layer = 1, int seed = 0) {} + virtual void init_beamsearch_conf(const int64_t k) {} + virtual std::vector> sample( + const std::vector>& user_inputs, + const std::vector& input_targets, + bool with_hierarchy = false) = 0; +}; + +class LayerWiseSampler : public IndexSampler { + public: + virtual ~LayerWiseSampler() {} + explicit LayerWiseSampler(const std::string& name) { + tree_ = IndexWrapper::GetInstance()->get_tree_index(name); + } + + void init_layerwise_conf(const std::vector& layer_sample_counts, + int start_sample_layer, int seed) override { + seed_ = seed; + start_sample_layer_ = start_sample_layer; + + PADDLE_ENFORCE_GT( + start_sample_layer_, 0, + paddle::platform::errors::InvalidArgument( + "start sampler layer = [%d], it should greater than 0.", + start_sample_layer_)); + PADDLE_ENFORCE_LT(start_sample_layer_, tree_->Height(), + paddle::platform::errors::InvalidArgument( + "start sampler layer = [%d], it should less than " + "max_layer, which is [%d].", + start_sample_layer_, tree_->Height())); + + size_t i = 0; + layer_counts_sum_ = 0; + layer_counts_.clear(); + int cur_layer = start_sample_layer_; + while (cur_layer < tree_->Height()) { + int layer_sample_num = 1; + if (i < layer_sample_counts.size()) { + layer_sample_num = layer_sample_counts[i]; + } + layer_counts_sum_ += layer_sample_num + 1; + layer_counts_.push_back(layer_sample_num); + VLOG(3) << "[INFO] level " << cur_layer + << " sample_layer_counts.push_back: " << layer_sample_num; + cur_layer += 1; + i += 1; + } + reverse(layer_counts_.begin(), layer_counts_.end()); + VLOG(3) << "sample counts sum: " << layer_counts_sum_; + } + std::vector> sample( + const std::vector>& user_inputs, + const std::vector& target_ids, bool with_hierarchy) override; + + private: + std::vector layer_counts_; + int64_t layer_counts_sum_{0}; + std::shared_ptr tree_{nullptr}; + int seed_{0}; + int start_sample_layer_{1}; +}; + +} // end namespace distributed +} // end namespace paddle diff --git a/paddle/fluid/distributed/index_dataset/index_wrapper.cc b/paddle/fluid/distributed/index_dataset/index_wrapper.cc new file mode 100644 index 0000000000000..99fe4ca0c6d04 --- /dev/null +++ b/paddle/fluid/distributed/index_dataset/index_wrapper.cc @@ -0,0 +1,196 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include +#include +#include +#include +#include +#include "paddle/fluid/framework/io/fs.h" + +#include +#include +#include "paddle/fluid/distributed/index_dataset/index_wrapper.h" + +namespace paddle { +namespace distributed { + +std::shared_ptr IndexWrapper::s_instance_(nullptr); + +int TreeIndex::Load(const std::string filename) { + int err_no; + auto fp = paddle::framework::fs_open_read(filename, &err_no, ""); + PADDLE_ENFORCE_NE( + fp, nullptr, + platform::errors::InvalidArgument( + "Open file %s failed. Please check whether the file exists.", + filename)); + + int num = 0; + max_id_ = 0; + fake_node_.set_id(0); + fake_node_.set_is_leaf(false); + fake_node_.set_probability(0.0); + max_code_ = 0; + size_t ret = fread(&num, sizeof(num), 1, fp.get()); + while (ret == 1 && num > 0) { + std::string content(num, '\0'); + size_t read_num = + fread(const_cast(content.data()), 1, num, fp.get()); + PADDLE_ENFORCE_EQ( + read_num, static_cast(num), + platform::errors::InvalidArgument( + "Read from file: %s failed. Valid Format is " + "an integer representing the length of the following string, " + "and the string itself.We got an iteger[% d], " + "but the following string's length is [%d].", + filename, num, read_num)); + + KVItem item; + PADDLE_ENFORCE_EQ( + item.ParseFromString(content), true, + platform::errors::InvalidArgument("Parse from file: %s failed. It's " + "content can't be parsed by KVItem.", + filename)); + + if (item.key() == ".tree_meta") { + meta_.ParseFromString(item.value()); + } else { + auto code = boost::lexical_cast(item.key()); + IndexNode node; + node.ParseFromString(item.value()); + PADDLE_ENFORCE_NE(node.id(), 0, + platform::errors::InvalidArgument( + "Node'id should not be equel to zero.")); + if (node.is_leaf()) { + id_codes_map_[node.id()] = code; + } + data_[code] = node; + if (node.id() > max_id_) { + max_id_ = node.id(); + } + if (code > max_code_) { + max_code_ = code; + } + } + ret = fread(&num, sizeof(num), 1, fp.get()); + } + total_nodes_num_ = data_.size(); + max_code_ += 1; + return 0; +} + +std::vector TreeIndex::GetNodes(const std::vector& codes) { + std::vector nodes; + nodes.reserve(codes.size()); + for (size_t i = 0; i < codes.size(); i++) { + if (CheckIsValid(codes[i])) { + nodes.push_back(data_.at(codes[i])); + } else { + nodes.push_back(fake_node_); + } + } + return nodes; +} + +std::vector TreeIndex::GetLayerCodes(int level) { + uint64_t level_num = static_cast(std::pow(meta_.branch(), level)); + uint64_t level_offset = level_num - 1; + + std::vector res; + res.reserve(level_num); + for (uint64_t i = 0; i < level_num; i++) { + auto code = level_offset + i; + if (CheckIsValid(code)) { + res.push_back(code); + } + } + return res; +} + +std::vector TreeIndex::GetAncestorCodes( + const std::vector& ids, int level) { + std::vector res; + res.reserve(ids.size()); + + int cur_level; + for (size_t i = 0; i < ids.size(); i++) { + if (id_codes_map_.find(ids[i]) == id_codes_map_.end()) { + res.push_back(max_code_); + } else { + auto code = id_codes_map_.at(ids[i]); + cur_level = meta_.height() - 1; + + while (level >= 0 && cur_level > level) { + code = (code - 1) / meta_.branch(); + cur_level--; + } + res.push_back(code); + } + } + return res; +} + +std::vector TreeIndex::GetChildrenCodes(uint64_t ancestor, + int level) { + auto level_code_num = static_cast(std::pow(meta_.branch(), level)); + auto code_min = level_code_num - 1; + auto code_max = meta_.branch() * level_code_num - 1; + + std::vector parent; + parent.push_back(ancestor); + 
std::vector res; + size_t p_idx = 0; + while (true) { + size_t p_size = parent.size(); + for (; p_idx < p_size; p_idx++) { + for (int i = 0; i < meta_.branch(); i++) { + auto code = parent[p_idx] * meta_.branch() + i + 1; + if (data_.find(code) != data_.end()) parent.push_back(code); + } + } + if ((code_min <= parent[p_idx]) && (parent[p_idx] < code_max)) { + break; + } + } + + return std::vector(parent.begin() + p_idx, parent.end()); +} + +std::vector TreeIndex::GetTravelCodes(uint64_t id, int start_level) { + std::vector res; + PADDLE_ENFORCE_NE(id_codes_map_.find(id), id_codes_map_.end(), + paddle::platform::errors::InvalidArgument( + "id = %d doesn't exist in Tree.", id)); + auto code = id_codes_map_.at(id); + int level = meta_.height() - 1; + + while (level >= start_level) { + res.push_back(code); + code = (code - 1) / meta_.branch(); + level--; + } + return res; +} + +std::vector TreeIndex::GetAllLeafs() { + std::vector res; + res.reserve(id_codes_map_.size()); + for (auto& ite : id_codes_map_) { + auto code = ite.second; + res.push_back(data_.at(code)); + } + return res; +} + +} // end namespace distributed +} // end namespace paddle diff --git a/paddle/fluid/distributed/index_dataset/index_wrapper.h b/paddle/fluid/distributed/index_dataset/index_wrapper.h new file mode 100644 index 0000000000000..8fb8faf6c84a2 --- /dev/null +++ b/paddle/fluid/distributed/index_dataset/index_wrapper.h @@ -0,0 +1,120 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include +#include +#include +#include +#include +#include +#include +#include "paddle/fluid/distributed/index_dataset/index_dataset.pb.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace distributed { + +class Index { + public: + Index() {} + ~Index() {} +}; + +class TreeIndex : public Index { + public: + TreeIndex() {} + ~TreeIndex() {} + + int Height() { return meta_.height(); } + int Branch() { return meta_.branch(); } + uint64_t TotalNodeNums() { return total_nodes_num_; } + uint64_t EmbSize() { return max_id_ + 1; } + int Load(const std::string path); + + inline bool CheckIsValid(int code) { + if (data_.find(code) != data_.end()) { + return true; + } else { + return false; + } + } + + std::vector GetNodes(const std::vector& codes); + std::vector GetLayerCodes(int level); + std::vector GetAncestorCodes(const std::vector& ids, + int level); + std::vector GetChildrenCodes(uint64_t ancestor, int level); + std::vector GetTravelCodes(uint64_t id, int start_level); + std::vector GetAllLeafs(); + + std::unordered_map data_; + std::unordered_map id_codes_map_; + uint64_t total_nodes_num_; + TreeMeta meta_; + uint64_t max_id_; + uint64_t max_code_; + IndexNode fake_node_; +}; + +using TreePtr = std::shared_ptr; + +class IndexWrapper { + public: + virtual ~IndexWrapper() {} + IndexWrapper() {} + + void clear_tree() { tree_map.clear(); } + + TreePtr get_tree_index(const std::string name) { + PADDLE_ENFORCE_NE(tree_map.find(name), tree_map.end(), + paddle::platform::errors::InvalidArgument( + "tree [%s] doesn't exist. Please insert it firstly " + "by API[\' insert_tree_index \'].", + name)); + return tree_map[name]; + } + + void insert_tree_index(const std::string name, const std::string tree_path) { + if (tree_map.find(name) != tree_map.end()) { + VLOG(0) << "Tree " << name << " has already existed."; + return; + } + TreePtr tree = std::make_shared(); + int ret = tree->Load(tree_path); + PADDLE_ENFORCE_EQ(ret, 0, paddle::platform::errors::InvalidArgument( + "Load tree[%s] from path[%s] failed. 
Please " + "check whether the file exists.", + name, tree_path)); + tree_map.insert(std::pair{name, tree}); + } + + static std::shared_ptr GetInstancePtr() { + if (NULL == s_instance_) { + s_instance_.reset(new paddle::distributed::IndexWrapper()); + } + return s_instance_; + } + + static IndexWrapper* GetInstance() { + if (NULL == s_instance_) { + s_instance_.reset(new paddle::distributed::IndexWrapper()); + } + return s_instance_.get(); + } + + private: + static std::shared_ptr s_instance_; + std::unordered_map tree_map; +}; + +} // end namespace distributed +} // end namespace paddle diff --git a/paddle/fluid/distributed/service/CMakeLists.txt b/paddle/fluid/distributed/service/CMakeLists.txt index 843dea9eea6ef..d1f04e26ade72 100644 --- a/paddle/fluid/distributed/service/CMakeLists.txt +++ b/paddle/fluid/distributed/service/CMakeLists.txt @@ -16,6 +16,7 @@ set_source_files_properties(communicator.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUT set_source_files_properties(service.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) set_source_files_properties(brpc_ps_server.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) set_source_files_properties(brpc_ps_client.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) +set_source_files_properties(ps_local_client.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) set_source_files_properties(brpc_utils.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) set_source_files_properties(heter_server.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) @@ -29,7 +30,8 @@ set_source_files_properties(graph_brpc_client.cc PROPERTIES COMPILE_FLAGS ${DIST cc_library(brpc_utils SRCS brpc_utils.cc DEPS tensor device_context ${COMMON_DEPS} ${RPC_DEPS}) cc_library(downpour_server SRCS graph_brpc_server.cc brpc_ps_server.cc DEPS boost eigen3 table brpc_utils simple_threadpool ${RPC_DEPS}) -cc_library(downpour_client SRCS graph_brpc_client.cc brpc_ps_client.cc DEPS boost eigen3 table brpc_utils simple_threadpool ${RPC_DEPS}) +cc_library(downpour_client SRCS graph_brpc_client.cc brpc_ps_client.cc +ps_local_client.cc DEPS boost eigen3 table brpc_utils simple_threadpool ${RPC_DEPS}) cc_library(client SRCS ps_client.cc DEPS downpour_client boost ${RPC_DEPS}) cc_library(server SRCS server.cc DEPS downpour_server boost ${RPC_DEPS}) diff --git a/paddle/fluid/distributed/service/brpc_ps_client.cc b/paddle/fluid/distributed/service/brpc_ps_client.cc index b49a71ab0c13a..a6ad9d08f52fd 100644 --- a/paddle/fluid/distributed/service/brpc_ps_client.cc +++ b/paddle/fluid/distributed/service/brpc_ps_client.cc @@ -880,8 +880,8 @@ std::future BrpcPsClient::send_client2client_msg( auto promise = std::make_shared>(); std::future fut = promise->get_future(); if (to_client_id >= _client_channels.size()) { - LOG(FATAL) << "to_client_id is out of range clients, which size is " - << _client_channels.size(); + VLOG(0) << "to_client_id is out of range clients, which size is " + << _client_channels.size(); promise->set_value(-1); return fut; } @@ -1001,4 +1001,4 @@ int32_t BrpcPsClient::recv_and_save_table(const uint64_t table_id, } } // namespace distributed -} // namespace paddle \ No newline at end of file +} // namespace paddle diff --git a/paddle/fluid/distributed/service/communicator.h b/paddle/fluid/distributed/service/communicator.h index 043fe9d83dfc5..fa60cab2b5877 100644 --- a/paddle/fluid/distributed/service/communicator.h +++ b/paddle/fluid/distributed/service/communicator.h @@ -310,6 +310,8 @@ class Communicator { return _worker_ptr; } + RecvCtxMap 
&GetRecvCtxMap() { return recv_varname_to_ctx_; } + std::shared_ptr _worker_ptr; // pointer to worker protected: diff --git a/paddle/fluid/distributed/service/graph_brpc_client.cc b/paddle/fluid/distributed/service/graph_brpc_client.cc index a6271cac83c9a..eafb4d596cc16 100644 --- a/paddle/fluid/distributed/service/graph_brpc_client.cc +++ b/paddle/fluid/distributed/service/graph_brpc_client.cc @@ -135,7 +135,8 @@ std::future GraphBrpcClient::get_node_feat( closure->request(request_idx) ->add_params(joint_feature_name.c_str(), joint_feature_name.size()); - PsService_Stub rpc_stub(get_cmd_channel(server_index)); + GraphPsService_Stub rpc_stub = + getServiceStub(get_cmd_channel(server_index)); closure->cntl(request_idx)->set_log_id(butil::gettimeofday_ms()); rpc_stub.service(closure->cntl(request_idx), closure->request(request_idx), closure->response(request_idx), closure); diff --git a/paddle/fluid/distributed/service/graph_py_service.h b/paddle/fluid/distributed/service/graph_py_service.h index e185f23e3d240..c6657be96ba44 100644 --- a/paddle/fluid/distributed/service/graph_py_service.h +++ b/paddle/fluid/distributed/service/graph_py_service.h @@ -54,19 +54,7 @@ class GraphPyService { std::vector table_feat_conf_feat_dtype; std::vector table_feat_conf_feat_shape; - // std::thread *server_thread, *client_thread; - - // std::shared_ptr pserver_ptr; - - // std::shared_ptr worker_ptr; - public: - // std::shared_ptr get_ps_server() { - // return pserver_ptr; - // } - // std::shared_ptr get_ps_client() { - // return worker_ptr; - // } int get_shard_num() { return shard_num; } void set_shard_num(int shard_num) { this->shard_num = shard_num; } void GetDownpourSparseTableProto( diff --git a/paddle/fluid/distributed/service/ps_client.cc b/paddle/fluid/distributed/service/ps_client.cc index 3f78908baa3b1..d45f41a0f58de 100644 --- a/paddle/fluid/distributed/service/ps_client.cc +++ b/paddle/fluid/distributed/service/ps_client.cc @@ -16,12 +16,15 @@ #include "glog/logging.h" #include "paddle/fluid/distributed/service/brpc_ps_client.h" #include "paddle/fluid/distributed/service/graph_brpc_client.h" +#include "paddle/fluid/distributed/service/ps_local_client.h" #include "paddle/fluid/distributed/table/table.h" namespace paddle { namespace distributed { REGISTER_PSCORE_CLASS(PSClient, BrpcPsClient); +REGISTER_PSCORE_CLASS(PSClient, PsLocalClient); REGISTER_PSCORE_CLASS(PSClient, GraphBrpcClient); + int32_t PSClient::configure( const PSParameter &config, const std::map> ®ions, @@ -83,4 +86,4 @@ PSClient *PSClientFactory::create(const PSParameter &ps_config) { return client; } } // namespace distributed -} // namespace paddle \ No newline at end of file +} // namespace paddle diff --git a/paddle/fluid/distributed/service/ps_client.h b/paddle/fluid/distributed/service/ps_client.h index 1c8abc6c2e8dc..74a1e0dde71fc 100644 --- a/paddle/fluid/distributed/service/ps_client.h +++ b/paddle/fluid/distributed/service/ps_client.h @@ -118,6 +118,17 @@ class PSClient { const uint64_t *keys, size_t num, bool is_training) = 0; + virtual ::std::future pull_sparse_ptr(char **select_values, + size_t table_id, + const uint64_t *keys, + size_t num) { + VLOG(0) << "Did not implement"; + std::promise promise; + std::future fut = promise.get_future(); + promise.set_value(-1); + return fut; + } + virtual std::future print_table_stat(uint32_t table_id) = 0; // 确保所有积攒中的请求都发起发送 @@ -150,7 +161,7 @@ class PSClient { virtual std::future send_client2client_msg(int msg_type, int to_client_id, const std::string &msg) { - LOG(FATAL) << 
"Did not implement"; + VLOG(0) << "Did not implement"; std::promise promise; std::future fut = promise.get_future(); promise.set_value(-1); diff --git a/paddle/fluid/distributed/service/ps_local_client.cc b/paddle/fluid/distributed/service/ps_local_client.cc new file mode 100644 index 0000000000000..2acc845a50890 --- /dev/null +++ b/paddle/fluid/distributed/service/ps_local_client.cc @@ -0,0 +1,269 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/distributed/service/ps_local_client.h" +#include "paddle/fluid/distributed/table/table.h" + +//#define pslib_debug_dense_compress + +namespace paddle { +namespace distributed { +int32_t PsLocalClient::initialize() { + const auto& downpour_param = _config.server_param().downpour_server_param(); + TableManager::instance().initialize(); + for (size_t i = 0; i < downpour_param.downpour_table_param_size(); ++i) { + auto* table = CREATE_PSCORE_CLASS( + Table, downpour_param.downpour_table_param(i).table_class()); + table->initialize(downpour_param.downpour_table_param(i), + _config.fs_client_param()); + table->set_shard(0, 1); + _table_map[downpour_param.downpour_table_param(i).table_id()].reset(table); + } + return 0; +} + +::std::future PsLocalClient::shrink(uint32_t table_id, + const std::string threshold) { + // TODO + return done(); +} + +::std::future PsLocalClient::load(const std::string& epoch, + const std::string& mode) { + // TODO + // for (auto& it : _table_map) { + // load(it.first, epoch, mode); + //} + return done(); +} +::std::future PsLocalClient::load(uint32_t table_id, + const std::string& epoch, + const std::string& mode) { + // TODO + // auto* table_ptr = table(table_id); + // table_ptr->load(epoch, mode); + return done(); +} + +::std::future PsLocalClient::save(const std::string& epoch, + const std::string& mode) { + // TODO + for (auto& it : _table_map) { + save(it.first, epoch, mode); + } + return done(); +} +::std::future PsLocalClient::save(uint32_t table_id, + const std::string& epoch, + const std::string& mode) { + // TODO + auto* table_ptr = table(table_id); + table_ptr->flush(); + table_ptr->save(epoch, mode); + return done(); +} + +::std::future PsLocalClient::clear() { + // TODO + return done(); +} +::std::future PsLocalClient::clear(uint32_t table_id) { + // TODO + return done(); +} + +::std::future PsLocalClient::flush() { + // no need + return done(); +} + +::std::future PsLocalClient::stop_server() { + // no need + return done(); +} + +::std::future PsLocalClient::pull_dense(Region* regions, + size_t region_num, + size_t table_id) { + auto* accessor = table_accessor(table_id); + auto* table_ptr = table(table_id); + + uint32_t num_per_shard = dense_dim_per_shard(accessor->fea_dim(), 1); + std::vector region_buffer; + region_buffer.resize(num_per_shard); + table_ptr->pull_dense(region_buffer.data(), region_buffer.size()); + + size_t region_idx = 0; + size_t region_data_idx = 0; + size_t shard_data_size = num_per_shard; + 
size_t shard_buffer_remain = shard_data_size * sizeof(float); + PADDLE_ENFORCE_EQ( + shard_buffer_remain, region_buffer.size() * sizeof(float), + platform::errors::PreconditionNotMet("pull dense size error.")); + size_t index = 0; + while (shard_buffer_remain > 0 && region_idx < region_num) { + auto& region = regions[region_idx]; + if (region.size - region_data_idx >= shard_buffer_remain) { + memcpy((void*)(region.data + region_data_idx), + (uint8_t*)(void*)(region_buffer.data()) + index, + shard_buffer_remain); + region_data_idx += shard_buffer_remain; + shard_buffer_remain = 0; + } else if (region.size - region_data_idx == 0) { + ++region_idx; + region_data_idx = 0; + } else { + memcpy((void*)(region.data + region_data_idx), + (uint8_t*)(void*)(region_buffer.data()) + index, + region.size - region_data_idx); + shard_buffer_remain -= (region.size - region_data_idx); + index += (region.size - region_data_idx); + ++region_idx; + region_data_idx = 0; + } + } + + return done(); +} + +::std::future PsLocalClient::push_dense_param(const Region* regions, + size_t region_num, + size_t table_id) { + auto* accessor = table_accessor(table_id); + auto* table_ptr = table(table_id); + + std::vector region_buffer; + region_buffer.resize(dense_dim_per_shard(accessor->fea_dim(), 1), 0); + for (size_t i = 0, offset = 0; i < region_num; ++i) { + uint32_t data_num = regions[i].size / sizeof(float); + memcpy(region_buffer.data() + offset, regions[i].data, regions[i].size); + offset += data_num; + } + + // table_ptr->push_dense_param(region_buffer.data(), region_buffer.size()); + + return done(); +} + +::std::future PsLocalClient::push_dense_raw_gradient( + int table_id, float* total_send_data, size_t total_send_data_size, + void* callback) { + VLOG(1) << "wxx push_dense_raw_gradient"; + + PSClientClosure* closure = reinterpret_cast(callback); + + auto* table_ptr = table(table_id); + + table_ptr->push_dense(total_send_data, total_send_data_size); + delete closure; + return done(); +} + +::std::future PsLocalClient::push_dense(const Region* regions, + size_t region_num, + size_t table_id) { + auto* accessor = table_accessor(table_id); + auto* table_ptr = table(table_id); + + std::vector region_buffer; + region_buffer.resize(dense_dim_per_shard(accessor->fea_dim(), 1)); + size_t data_size = region_buffer.size(); + for (size_t i = 0, offset = 0; i < region_num; ++i) { + uint32_t data_num = regions[i].size / sizeof(float); + PADDLE_ENFORCE_LE( + offset + data_num, data_size, + platform::errors::PreconditionNotMet( + "invalid dense size, cur pos[%d] data_num[%d] size[%d]", offset, + data_num, data_size)); + memcpy(region_buffer.data() + offset, regions[i].data, regions[i].size); + offset += data_num; + } + + table_ptr->push_dense(region_buffer.data(), region_buffer.size()); + + return done(); +} + +//::std::future PsLocalClient::pull_sparse(float** select_values, +// size_t table_id, +// const uint64_t* keys, +// size_t num) { +// // FIXME +// // auto timer = +// // std::make_shared("pslib_downpour_client_pull_sparse"); +// // auto local_timer = +// // std::make_shared("pslib_downpour_client_pull_sparse_local"); +// //将key拆分到各shard请求,并记录原始对应value指针 +// auto* accessor = table_accessor(table_id); +// auto* table_ptr = table(table_id); +// size_t value_size = accessor->select_size(); +// +// // table_ptr->pull_sparse(keys, num); +// std::vector res_data; +// res_data.resize(num * value_size / sizeof(float)); +// table_ptr->pull_sparse(res_data.data(), keys, num); +// // memcpy(select_values[0], res_data->data(), 
res_data->size() * +// // sizeof(float)); +// size_t offset = 0; +// for (int i = 0; i < num; ++i) { +// memcpy(select_values[i], (char*)res_data.data() + offset, value_size); +// offset += value_size; +// } +// +// // return fut; +// return done(); +//} + +::std::future PsLocalClient::pull_sparse_ptr(char** select_values, + size_t table_id, + const uint64_t* keys, + size_t num) { + // FIXME + // auto timer = + // std::make_shared("pslib_downpour_client_pull_sparse"); + // auto local_timer = + // std::make_shared("pslib_downpour_client_pull_sparse_local"); + //将key拆分到各shard请求,并记录原始对应value指针 + auto* table_ptr = table(table_id); + + table_ptr->pull_sparse_ptr(select_values, keys, num); + + return done(); +} + +::std::future PsLocalClient::push_sparse_raw_gradient( + size_t table_id, const uint64_t* keys, const float** update_values, + size_t num, void* callback) { + VLOG(1) << "wxx push_sparse_raw_gradient"; + PSClientClosure* closure = reinterpret_cast(callback); + auto* accessor = table_accessor(table_id); + auto* table_ptr = table(table_id); + + table_ptr->push_sparse(keys, update_values, num); + delete closure; + return done(); +} + +::std::future PsLocalClient::push_sparse(size_t table_id, + const uint64_t* keys, + const float** update_values, + size_t num) { + auto* accessor = table_accessor(table_id); + auto* table_ptr = table(table_id); + + table_ptr->push_sparse(keys, update_values, num); + return done(); +} +} +} diff --git a/paddle/fluid/distributed/service/ps_local_client.h b/paddle/fluid/distributed/service/ps_local_client.h new file mode 100644 index 0000000000000..9d2b01a45fe92 --- /dev/null +++ b/paddle/fluid/distributed/service/ps_local_client.h @@ -0,0 +1,226 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License 0// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
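The pull_dense/push_dense paths above shuttle data between the caller's list of Region views and one contiguous float buffer per shard. A hedged sketch of the gather half of that copy; RegionView and the buffer layout are illustrative stand-ins, not the Paddle types.

```cpp
// Illustrative gather: copy a sequence of raw byte regions into one
// contiguous float buffer, tracking the float offset as push_dense does.
// RegionView is a stand-in for paddle::distributed::Region.
#include <cstdint>
#include <cstring>
#include <vector>

struct RegionView {
  const char* data;
  size_t size;  // bytes, expected to be a multiple of sizeof(float)
};

std::vector<float> gather(const RegionView* regions, size_t region_num) {
  std::vector<float> buffer;
  for (size_t i = 0; i < region_num; ++i) {
    size_t data_num = regions[i].size / sizeof(float);
    size_t offset = buffer.size();
    buffer.resize(offset + data_num);
    std::memcpy(buffer.data() + offset, regions[i].data, regions[i].size);
  }
  return buffer;
}
```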
+ +#pragma once +#include "brpc/channel.h" +#include "brpc/controller.h" +#include "brpc/server.h" +#include "paddle/fluid/distributed/service/ps_client.h" + +namespace paddle { +namespace distributed { + +class Table; + +class PsLocalClient : public PSClient { + public: + PsLocalClient() {} + virtual ~PsLocalClient() { _running = false; } + virtual int32_t create_client2client_connection(int pslib_timeout_ms, + int pslib_connect_timeout_ms, + int max_retry) { + return 0; + } + + virtual ::std::future shrink(uint32_t table_id, + const std::string threshold) override; + virtual ::std::future load(const std::string& epoch, + const std::string& mode) override; + virtual ::std::future load(uint32_t table_id, + const std::string& epoch, + const std::string& mode) override; + + virtual ::std::future save(const std::string& epoch, + const std::string& mode) override; + virtual ::std::future save(uint32_t table_id, + const std::string& epoch, + const std::string& mode) override; + + virtual ::std::future clear() override; + virtual ::std::future clear(uint32_t table_id) override; + + virtual ::std::future stop_server() override; + + virtual void finalize_worker() override {} + virtual ::std::future pull_dense(Region* regions, size_t region_num, + size_t table_id); + + virtual ::std::future push_dense(const Region* regions, + size_t region_num, size_t table_id); + + virtual ::std::future push_dense_param(const Region* regions, + size_t region_num, + size_t table_id); + + virtual ::std::future pull_sparse(float** select_values, + size_t table_id, + const uint64_t* keys, size_t num, + bool is_training) { + std::promise prom; + std::future fut = prom.get_future(); + prom.set_value(0); + + return fut; + } + + virtual ::std::future pull_sparse_ptr(char** select_values, + size_t table_id, + const uint64_t* keys, + size_t num); + + virtual ::std::future print_table_stat(uint32_t table_id) { + std::promise prom; + std::future fut = prom.get_future(); + prom.set_value(0); + + return fut; + } + virtual ::std::future push_sparse(size_t table_id, + const uint64_t* keys, + const float** update_values, + size_t num); + + virtual ::std::future flush(); + // server profilera + virtual std::future start_profiler() { + std::promise prom; + std::future fut = prom.get_future(); + prom.set_value(0); + + return fut; + }; + + virtual std::future stop_profiler() { + std::promise prom; + std::future fut = prom.get_future(); + prom.set_value(0); + + return fut; + } + + virtual std::future barrier(size_t table_id, uint32_t barrier_type) { + std::promise prom; + std::future fut = prom.get_future(); + prom.set_value(0); + + return fut; + } + + virtual std::future pull_geo_param(size_t table_id, + std::vector* values, + std::vector* keys, + int pserver_idx) { + std::promise prom; + std::future fut = prom.get_future(); + prom.set_value(0); + + return fut; + } + + virtual std::future push_global_step(int table_id, + int64_t* total_send_data, + void* done) { + std::promise prom; + std::future fut = prom.get_future(); + prom.set_value(0); + + return fut; + } + + // recv table from server and save it in LodTensor + virtual int32_t recv_and_save_table(const uint64_t table_id, + const std::string& path) { + return 0; + } + + virtual ::std::future send_client2client_msg( + int msg_type, int to_client_id, const std::string& msg) override { + std::promise prom; + std::future fut = prom.get_future(); + prom.set_value(0); + + return fut; + } + virtual size_t get_server_nums() { return 1; } + + virtual std::future push_dense_raw_gradient( + 
int table_id, float* total_send_data, size_t total_send_data_size, + void* callback) override; + + virtual std::future push_sparse_raw_gradient( + size_t table_id, const uint64_t* keys, const float** update_values, + size_t num, void* callback) override; + + virtual std::future push_sparse_raw_gradient_partial( + size_t table_id, const uint64_t* keys, const float** update_values, + uint32_t num, void* done, int pserver_idx) override { + std::promise prom; + std::future fut = prom.get_future(); + prom.set_value(0); + + return fut; + } + + virtual std::future push_sparse_param(size_t table_id, + const uint64_t* keys, + const float** update_values, + size_t num, + void* done) override { + std::promise prom; + std::future fut = prom.get_future(); + prom.set_value(0); + + return fut; + } + + private: + virtual int32_t initialize() override; + + std::future done() { + std::shared_ptr> prom = + std::make_shared>(); + std::future fut = prom->get_future(); + prom->set_value(0); + return fut; + } + + inline uint32_t dense_dim_per_shard(uint32_t dense_dim_total, + uint32_t shard_num) { + return dense_dim_total / shard_num + 1; + } + + inline std::unordered_map>* table() { + return &_table_map; + } + + inline Table* table(size_t table_id) { + auto itr = _table_map.find(table_id); + if (itr != _table_map.end()) { + return itr->second.get(); + } + LOG(ERROR) << "table not found " << table_id; + return NULL; + } + + std::unordered_map> _table_map; + + bool _running = false; + bool _flushing = false; + + private: + float _mae = 0; + float _mse = 0; + uint16_t _push_times = 0; +}; +} +} diff --git a/paddle/fluid/operators/distributed/parameter_send.h b/paddle/fluid/distributed/service/ps_local_server.h similarity index 56% rename from paddle/fluid/operators/distributed/parameter_send.h rename to paddle/fluid/distributed/service/ps_local_server.h index 4335ef8c73cc0..dfbccc70900e3 100644 --- a/paddle/fluid/operators/distributed/parameter_send.h +++ b/paddle/fluid/distributed/service/ps_local_server.h @@ -1,4 +1,4 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
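pull_sparse_ptr, added above in both the implementation and the client interface, hands back pointers into the table's own value storage instead of copying each row out. A simplified sketch of that zero-copy contract; the std::unordered_map and fixed-width Row are stand-ins for the real sparse-table storage.

```cpp
// Zero-copy sketch of pull_sparse_ptr: out[i] aliases storage owned by
// the table, so no per-row memcpy happens on the pull path. Row's width
// is an illustrative assumption.
#include <cstdint>
#include <unordered_map>

struct Row {
  float data[8] = {0.f};  // assumed embedding width, illustration only
};

void pull_rows_by_ptr(std::unordered_map<uint64_t, Row>& table,
                      const uint64_t* keys, size_t num, char** out) {
  for (size_t i = 0; i < num; ++i) {
    Row& row = table[keys[i]];                   // creates the row on a miss
    out[i] = reinterpret_cast<char*>(row.data);  // caller reads in place
  }
}
```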
@@ -14,22 +14,24 @@ #pragma once -#include +#include #include - -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/operators/distributed/communicator_common.h" +#include "paddle/fluid/distributed/service/server.h" namespace paddle { -namespace operators { namespace distributed { -template -struct ParameterSend { - void operator()(const CommContext &rpc_ctx, const framework::Scope &scope, - bool sync, int multi_parts); -}; +class PsLocalServer : public PSServer { + public: + PsLocalServer() {} + virtual ~PsLocalServer() {} + virtual uint64_t start() { return 0; } + virtual uint64_t start(const std::string& ip, uint32_t port) { return 0; } + virtual int32_t stop() { return 0; } + virtual int32_t port() { return 0; } -}; // namespace distributed -}; // namespace operators -}; // namespace paddle + private: + virtual int32_t initialize() { return 0; } +}; +} +} diff --git a/paddle/fluid/distributed/service/server.cc b/paddle/fluid/distributed/service/server.cc index 9324adad6979e..e44876e3d2b78 100644 --- a/paddle/fluid/distributed/service/server.cc +++ b/paddle/fluid/distributed/service/server.cc @@ -17,12 +17,14 @@ #include "glog/logging.h" #include "paddle/fluid/distributed/service/brpc_ps_server.h" #include "paddle/fluid/distributed/service/graph_brpc_server.h" +#include "paddle/fluid/distributed/service/ps_local_server.h" #include "paddle/fluid/distributed/table/table.h" namespace paddle { namespace distributed { REGISTER_PSCORE_CLASS(PSServer, BrpcPsServer); +REGISTER_PSCORE_CLASS(PSServer, PsLocalServer); REGISTER_PSCORE_CLASS(PsBaseService, BrpcPsService); REGISTER_PSCORE_CLASS(PSServer, GraphBrpcServer); REGISTER_PSCORE_CLASS(PsBaseService, GraphBrpcService); diff --git a/paddle/fluid/distributed/table/common_graph_table.cc b/paddle/fluid/distributed/table/common_graph_table.cc index 020bcdcc52ef4..0dc99de1bfe82 100644 --- a/paddle/fluid/distributed/table/common_graph_table.cc +++ b/paddle/fluid/distributed/table/common_graph_table.cc @@ -171,7 +171,7 @@ int32_t GraphTable::load_nodes(const std::string &path, std::string node_type) { int32_t GraphTable::load_edges(const std::string &path, bool reverse_edge) { auto paths = paddle::string::split_string(path, ";"); - int count = 0; + int64_t count = 0; std::string sample_type = "random"; bool is_weighted = false; int valid_count = 0; diff --git a/paddle/fluid/distributed/table/common_graph_table.h b/paddle/fluid/distributed/table/common_graph_table.h index 8ddf3c8f904a6..b18da82abe61c 100644 --- a/paddle/fluid/distributed/table/common_graph_table.h +++ b/paddle/fluid/distributed/table/common_graph_table.h @@ -33,26 +33,11 @@ namespace paddle { namespace distributed { class GraphShard { public: - // static int bucket_low_bound; - // static int gcd(int s, int t) { - // if (s % t == 0) return t; - // return gcd(t, s % t); - // } size_t get_size(); GraphShard() {} - GraphShard(int shard_num) { - this->shard_num = shard_num; - // bucket_size = init_bucket_size(shard_num); - // bucket.resize(bucket_size); - } + GraphShard(int shard_num) { this->shard_num = shard_num; } std::vector &get_bucket() { return bucket; } std::vector get_batch(int start, int end, int step); - // int init_bucket_size(int shard_num) { - // for (int i = bucket_low_bound;; i++) { - // if (gcd(i, shard_num) == 1) return i; - // } - // return -1; - // } std::vector get_ids_by_range(int start, int end) { std::vector res; for (int i = start; i < end && i < bucket.size(); i++) { @@ -64,7 +49,6 @@ class GraphShard { FeatureNode *add_feature_node(uint64_t id); 
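server.cc above registers PsLocalServer next to BrpcPsServer, so the server implementation can be chosen by name at runtime. The expansion of REGISTER_PSCORE_CLASS / CREATE_PSCORE_CLASS is not shown in this diff, so the following is only an assumed sketch of the usual name-to-factory mechanism behind such macros.

```cpp
// Hypothetical factory sketch of the registry pattern implied by
// REGISTER_PSCORE_CLASS(PSServer, PsLocalServer): a creator is stored
// under the class name so a config string can later produce an instance.
#include <functional>
#include <memory>
#include <string>
#include <unordered_map>

struct PSServerBase { virtual ~PSServerBase() = default; };

using Creator = std::function<std::unique_ptr<PSServerBase>()>;

std::unordered_map<std::string, Creator>& registry() {
  static std::unordered_map<std::string, Creator> r;
  return r;
}

template <class T>
bool register_server(const std::string& name) {
  registry()[name] = [] { return std::unique_ptr<PSServerBase>(new T()); };
  return true;
}

std::unique_ptr<PSServerBase> create_server(const std::string& name) {
  auto it = registry().find(name);
  return it == registry().end() ? nullptr : it->second();
}
```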
Node *find_node(uint64_t id); void add_neighboor(uint64_t id, uint64_t dst_id, float weight); - // std::unordered_map::iterator> std::unordered_map get_node_location() { return node_location; } @@ -131,7 +115,7 @@ class GraphTable : public SparseTable { protected: std::vector shards; size_t shard_start, shard_end, server_num, shard_num_per_table, shard_num; - const int task_pool_size_ = 11; + const int task_pool_size_ = 24; const int random_sample_nodes_ranges = 3; std::vector feat_name; diff --git a/paddle/fluid/distributed/table/common_sparse_table.cc b/paddle/fluid/distributed/table/common_sparse_table.cc index a25a90aa9a7c1..1c315d34abcb6 100644 --- a/paddle/fluid/distributed/table/common_sparse_table.cc +++ b/paddle/fluid/distributed/table/common_sparse_table.cc @@ -126,17 +126,17 @@ void ProcessALine(const std::vector& columns, const Meta& meta, int64_t SaveToText(std::ostream* os, std::shared_ptr block, const int mode) { int64_t not_save_num = 0; - for (auto value : block->values_) { - if (mode == SaveMode::delta && !value.second->need_save_) { + for (auto& value : block->values_) { + if (mode == SaveMode::delta && !value.second.need_save_) { not_save_num++; continue; } - auto* vs = value.second->data_.data(); + auto* vs = value.second.data_; std::stringstream ss; auto id = value.first; - ss << id << "\t" << value.second->count_ << "\t" - << value.second->unseen_days_ << "\t" << value.second->is_entry_ << "\t"; + ss << id << "\t" << value.second.count_ << "\t" << value.second.unseen_days_ + << "\t" << value.second.is_entry_ << "\t"; for (int i = 0; i < block->value_length_; i++) { ss << vs[i]; @@ -148,7 +148,7 @@ int64_t SaveToText(std::ostream* os, std::shared_ptr block, os->write(ss.str().c_str(), sizeof(char) * ss.str().size()); if (mode == SaveMode::base || mode == SaveMode::delta) { - value.second->need_save_ = false; + value.second.need_save_ = false; } } @@ -446,6 +446,43 @@ int32_t CommonSparseTable::pull_sparse(float* pull_values, return 0; } +int32_t CommonSparseTable::pull_sparse_ptr(char** pull_values, + const uint64_t* keys, size_t num) { + std::vector> offset_bucket; + offset_bucket.resize(task_pool_size_); + + for (int x = 0; x < num; ++x) { + auto y = keys[x] % task_pool_size_; + offset_bucket[y].push_back(x); + } + + std::vector> tasks(task_pool_size_); + + for (int shard_id = 0; shard_id < task_pool_size_; ++shard_id) { + tasks[shard_id] = _shards_task_pool[shard_id]->enqueue( + [this, shard_id, &keys, &offset_bucket, &pull_values]() -> int { + auto& block = shard_values_[shard_id]; + auto& offsets = offset_bucket[shard_id]; + + for (int i = 0; i < offsets.size(); ++i) { + auto offset = offsets[i]; + auto id = keys[offset]; + auto* value = block->InitGet(id); + // std::copy_n(value + param_offset_, param_dim_, + // pull_values + param_dim_ * offset); + pull_values[offset] = (char*)value; + } + + return 0; + }); + } + + for (size_t shard_id = 0; shard_id < tasks.size(); ++shard_id) { + tasks[shard_id].wait(); + } + return 0; +} + int32_t CommonSparseTable::_push_sparse(const uint64_t* keys, const float* values, size_t num) { rwlock_->RDLock(); @@ -502,6 +539,45 @@ int32_t CommonSparseTable::push_sparse(const uint64_t* keys, return 0; } +int32_t CommonSparseTable::push_sparse(const uint64_t* keys, + const float** values, size_t num) { + _push_sparse(keys, values, num); + return 0; +} + +int32_t CommonSparseTable::_push_sparse(const uint64_t* keys, + const float** values, size_t num) { + rwlock_->RDLock(); + std::vector> offset_bucket; + 
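pull_sparse_ptr above and the _push_sparse overload that continues below share one pattern: bucket the key offsets by key % task_pool_size_, run one task per shard, and wait on every future. A self-contained sketch with std::async standing in for the per-shard _shards_task_pool; the simplification drops the guarantee that all work for a shard runs on that shard's single thread.

```cpp
// Shard-bucketing sketch: group key offsets by shard, process each
// bucket concurrently, then join. std::async replaces Paddle's
// per-shard ThreadPool, so same-shard ordering is NOT preserved here.
#include <cstdint>
#include <functional>
#include <future>
#include <vector>

void for_each_shard(const uint64_t* keys, size_t num, int shard_count,
                    const std::function<void(int, const std::vector<size_t>&)>& fn) {
  std::vector<std::vector<size_t>> buckets(shard_count);
  for (size_t i = 0; i < num; ++i) {
    buckets[keys[i] % shard_count].push_back(i);  // store offsets, not keys
  }
  std::vector<std::future<void>> tasks;
  for (int s = 0; s < shard_count; ++s) {
    tasks.emplace_back(std::async(std::launch::async,
                                  [&fn, &buckets, s] { fn(s, buckets[s]); }));
  }
  for (auto& t : tasks) t.wait();
}
```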
offset_bucket.resize(task_pool_size_); + + for (int x = 0; x < num; ++x) { + auto y = keys[x] % task_pool_size_; + offset_bucket[y].push_back(x); + } + + std::vector> tasks(task_pool_size_); + + for (int shard_id = 0; shard_id < task_pool_size_; ++shard_id) { + tasks[shard_id] = _shards_task_pool[shard_id]->enqueue( + [this, shard_id, &keys, &values, num, &offset_bucket]() -> int { + auto& offsets = offset_bucket[shard_id]; + for (size_t i = 0; i < offsets.size(); ++i) { + std::vector tmp_off = {0}; + optimizer_->update(keys + offsets[i], values[offsets[i]], num, + tmp_off, shard_values_[shard_id].get()); + } + return 0; + }); + } + + for (size_t shard_id = 0; shard_id < tasks.size(); ++shard_id) { + tasks[shard_id].wait(); + } + rwlock_->UNLock(); + return 0; +} + int32_t CommonSparseTable::push_sparse_param(const uint64_t* keys, const float* values, size_t num) { rwlock_->RDLock(); diff --git a/paddle/fluid/distributed/table/common_sparse_table.h b/paddle/fluid/distributed/table/common_sparse_table.h index 31f4dabcdfdd7..50c295da53464 100644 --- a/paddle/fluid/distributed/table/common_sparse_table.h +++ b/paddle/fluid/distributed/table/common_sparse_table.h @@ -63,9 +63,15 @@ class CommonSparseTable : public SparseTable { virtual std::pair print_table_stat(); virtual int32_t pull_sparse(float* values, const PullSparseValue& pull_value); + virtual int32_t pull_sparse_ptr(char** pull_values, const uint64_t* keys, + size_t num); + virtual int32_t push_sparse(const uint64_t* keys, const float* values, size_t num); + virtual int32_t push_sparse(const uint64_t* keys, const float** values, + size_t num); + // only for sparse geo table virtual int32_t push_sparse_param(const uint64_t* keys, const float* values, size_t num); @@ -80,6 +86,8 @@ class CommonSparseTable : public SparseTable { protected: virtual int32_t _push_sparse(const uint64_t* keys, const float* values, size_t num); + virtual int32_t _push_sparse(const uint64_t* keys, const float** values, + size_t num); private: const int task_pool_size_ = 11; diff --git a/paddle/fluid/distributed/table/depends/large_scale_kv.h b/paddle/fluid/distributed/table/depends/large_scale_kv.h index cb077033cad42..bb4174bd2c579 100644 --- a/paddle/fluid/distributed/table/depends/large_scale_kv.h +++ b/paddle/fluid/distributed/table/depends/large_scale_kv.h @@ -28,6 +28,7 @@ #include "paddle/fluid/distributed/common/utils.h" #include "paddle/fluid/distributed/table/depends/initializers.h" +#include "paddle/fluid/distributed/thirdparty/round_robin.h" #include "paddle/fluid/framework/generator.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/rw_lock.h" @@ -54,23 +55,53 @@ struct VALUE { unseen_days_(0), need_save_(false), is_entry_(false) { - data_.resize(length); - memset(data_.data(), 0, sizeof(float) * length); + data_ = new float[length]; + memset(data_, 0, sizeof(float) * length); + } + + VALUE(const VALUE &value) { + length_ = value.length_; + count_ = value.count_; + unseen_days_ = value.unseen_days_; + need_save_ = value.need_save_; + is_entry_ = value.is_entry_; + data_ = new float[length_]; + memcpy(data_, value.data_, sizeof(float) * length_); + } + + VALUE &operator=(const VALUE &value) { + if (this != &value) { + delete[] data_; + length_ = value.length_; + count_ = value.count_; + unseen_days_ = value.unseen_days_; + need_save_ = value.need_save_; + is_entry_ = value.is_entry_; + + data_ = new float[length_]; + memcpy(data_, value.data_, sizeof(float) * length_); + } + return *this; + } + + ~VALUE() { + 
delete[] data_; + data_ = nullptr; } size_t length_; - std::vector data_; int count_; int unseen_days_; // use to check knock-out bool need_save_; // whether need to save bool is_entry_; // whether knock-in + float *data_; }; -inline bool count_entry(std::shared_ptr value, int threshold) { +inline bool count_entry(VALUE *value, int threshold) { return value->count_ >= threshold; } -inline bool probility_entry(std::shared_ptr value, float threshold) { +inline bool probility_entry(VALUE *value, float threshold) { UniformInitializer uniform = UniformInitializer({"uniform", "0", "0", "1"}); return uniform.GetValue() >= threshold; } @@ -87,7 +118,7 @@ class ValueBlock { value_dims_(value_dims), value_offsets_(value_offsets), value_idx_(value_idx) { - for (int x = 0; x < value_dims.size(); ++x) { + for (size_t x = 0; x < value_dims.size(); ++x) { value_length_ += value_dims[x]; } @@ -96,13 +127,15 @@ class ValueBlock { auto slices = string::split_string(entry_attr, ":"); if (slices[0] == "none") { entry_func_ = std::bind(&count_entry, std::placeholders::_1, 0); + threshold_ = 0; } else if (slices[0] == "count_filter_entry") { - int threshold = std::stoi(slices[1]); - entry_func_ = std::bind(&count_entry, std::placeholders::_1, threshold); + threshold_ = std::stoi(slices[1]); + entry_func_ = + std::bind(&count_entry, std::placeholders::_1, threshold_); } else if (slices[0] == "probability_entry") { - float threshold = std::stof(slices[1]); + threshold_ = std::stof(slices[1]); entry_func_ = - std::bind(&probility_entry, std::placeholders::_1, threshold); + std::bind(&probility_entry, std::placeholders::_1, threshold_); } else { PADDLE_THROW(platform::errors::InvalidArgument( "Not supported Entry Type : %s, Only support [CountFilterEntry, " @@ -148,7 +181,7 @@ class ValueBlock { PADDLE_ENFORCE_EQ( value_dims[i], value_dims_[i], platform::errors::InvalidArgument("value dims is not match")); - pts.push_back(values->data_.data() + + pts.push_back(values.data_ + value_offsets_.at(value_idx_.at(value_names[i]))); } return pts; @@ -158,19 +191,35 @@ class ValueBlock { float *Init(const uint64_t &id, const bool with_update = true, const int counter = 1) { if (!Has(id)) { - values_[id] = std::make_shared(value_length_); + values_.emplace(std::make_pair(id, VALUE(value_length_))); } auto &value = values_.at(id); if (with_update) { - AttrUpdate(value, counter); + AttrUpdate(&value, counter); } - return value->data_.data(); + return value.data_; } - void AttrUpdate(std::shared_ptr value, const int counter) { + + VALUE *InitGet(const uint64_t &id, const bool with_update = true, + const int counter = 1) { + if (!Has(id)) { + values_.emplace(std::make_pair(id, VALUE(value_length_))); + } + + auto &value = values_.at(id); + + if (with_update) { + AttrUpdate(&value, counter); + } + + return &value; + } + + void AttrUpdate(VALUE *value, const int counter) { // update state value->unseen_days_ = 0; value->count_ += counter; @@ -179,8 +228,8 @@ class ValueBlock { value->is_entry_ = entry_func_(value); if (value->is_entry_) { // initialize - for (int x = 0; x < value_names_.size(); ++x) { - initializers_[x]->GetValue(value->data_.data() + value_offsets_[x], + for (size_t x = 0; x < value_names_.size(); ++x) { + initializers_[x]->GetValue(value->data_ + value_offsets_[x], value_dims_[x]); } value->need_save_ = true; @@ -195,27 +244,27 @@ class ValueBlock { // dont jude if (has(id)) float *Get(const uint64_t &id) { auto &value = values_.at(id); - return value->data_.data(); + return value.data_; } // for load, to reset 
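With data_ switching from std::vector<float> to an owning raw float*, VALUE has to spell out the copy constructor, copy assignment, and destructor shown above; the compiler-generated versions would alias the buffer and double-free it. A compact restatement of the same ownership pattern for reference; OwnedBuf is illustrative, and the copy-and-swap assignment is an alternative formulation, not what the patch itself does.

```cpp
// Rule-of-three sketch for an owning raw float buffer. new float[n]()
// zero-initializes, mirroring the memset in VALUE's constructor.
#include <cstddef>
#include <cstring>
#include <utility>

struct OwnedBuf {
  explicit OwnedBuf(size_t n) : len(n), data(new float[n]()) {}
  OwnedBuf(const OwnedBuf& o) : len(o.len), data(new float[o.len]) {
    std::memcpy(data, o.data, sizeof(float) * len);
  }
  OwnedBuf& operator=(OwnedBuf o) {  // copy-and-swap; see note above
    std::swap(len, o.len);
    std::swap(data, o.data);
    return *this;
  }
  ~OwnedBuf() { delete[] data; }

  size_t len;
  float* data;
};
```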
count, unseen_days - std::shared_ptr GetValue(const uint64_t &id) { return values_.at(id); } + VALUE *GetValue(const uint64_t &id) { return &values_.at(id); } bool GetEntry(const uint64_t &id) { auto &value = values_.at(id); - return value->is_entry_; + return value.is_entry_; } void SetEntry(const uint64_t &id, const bool state) { auto &value = values_.at(id); - value->is_entry_ = state; + value.is_entry_ = state; } void Shrink(const int threshold) { for (auto iter = values_.begin(); iter != values_.end();) { auto &value = iter->second; - value->unseen_days_++; - if (value->unseen_days_ >= threshold) { + value.unseen_days_++; + if (value.unseen_days_ >= threshold) { iter = values_.erase(iter); } else { ++iter; @@ -224,6 +273,8 @@ class ValueBlock { return; } + float GetThreshold() { return threshold_; } + private: bool Has(const uint64_t id) { auto got = values_.find(id); @@ -235,7 +286,7 @@ class ValueBlock { } public: - std::unordered_map> values_; + robin_hood::unordered_map values_; size_t value_length_ = 0; private: @@ -244,9 +295,11 @@ class ValueBlock { const std::vector &value_offsets_; const std::unordered_map &value_idx_; - std::function)> entry_func_; + std::function entry_func_; std::vector> initializers_; + float threshold_; }; } // namespace distributed } // namespace paddle + diff --git a/paddle/fluid/distributed/table/table.h b/paddle/fluid/distributed/table/table.h index 5bc818ff4741f..81a1ff5eced2b 100644 --- a/paddle/fluid/distributed/table/table.h +++ b/paddle/fluid/distributed/table/table.h @@ -48,10 +48,17 @@ class Table { return 0; } + virtual int32_t pull_sparse_ptr(char **pull_values, const uint64_t *keys, + size_t num) { + VLOG(0) << "NOT IMPLEMENT"; + return 0; + } virtual int32_t pull_sparse(float *values, const PullSparseValue &pull_value) = 0; virtual int32_t push_sparse(const uint64_t *keys, const float *values, size_t num) = 0; + virtual int32_t push_sparse(const uint64_t *keys, const float **values, + size_t num){}; virtual int32_t push_sparse_param(const uint64_t *keys, const float *values, size_t num) { return 0; diff --git a/paddle/fluid/distributed/thirdparty/round_robin.h b/paddle/fluid/distributed/thirdparty/round_robin.h new file mode 100644 index 0000000000000..f5075b4545af0 --- /dev/null +++ b/paddle/fluid/distributed/thirdparty/round_robin.h @@ -0,0 +1,2685 @@ +// ______ _____ ______ _________ +// ______________ ___ /_ ___(_)_______ ___ /_ ______ ______ ______ / +// __ ___/_ __ \__ __ \__ / __ __ \ __ __ \_ __ \_ __ \_ __ / +// _ / / /_/ /_ /_/ /_ / _ / / / _ / / // /_/ // /_/ // /_/ / +// /_/ \____/ /_.___/ /_/ /_/ /_/ ________/_/ /_/ \____/ \____/ \__,_/ +// _/_____/ +// +// Fast & memory efficient hashtable based on robin hood hashing for +// C++11/14/17/20 +// https://github.com/martinus/robin-hood-hashing +// +// Licensed under the MIT License . +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2021 Martin Ankerl +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all +// copies or substantial portions of the Software. 
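The values_ container above moves from a std::unordered_map keyed to std::shared_ptr<VALUE> to the vendored robin_hood::unordered_map holding VALUE inline, removing one heap allocation per slot. A small usage sketch against the vendored header; Slot stands in for VALUE, and pointers obtained this way are only stable until the next rehash or erase.

```cpp
// Inline-value lookup sketch using the vendored robin-hood map.
#include <cstdint>

#include "paddle/fluid/distributed/thirdparty/round_robin.h"

struct Slot {
  int count_ = 0;
  bool need_save_ = false;
};

robin_hood::unordered_map<uint64_t, Slot> values_;

Slot* init_get(uint64_t id) {
  auto it = values_.find(id);
  if (it == values_.end()) {
    it = values_.emplace(id, Slot{}).first;  // construct in place on a miss
  }
  return &it->second;  // valid until the next rehash/erase
}
```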
+// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +#ifndef ROBIN_HOOD_H_INCLUDED +#define ROBIN_HOOD_H_INCLUDED + +// see https://semver.org/ +#define ROBIN_HOOD_VERSION_MAJOR 3 // for incompatible API changes +#define ROBIN_HOOD_VERSION_MINOR \ + 11 // for adding functionality in a backwards-compatible manner +#define ROBIN_HOOD_VERSION_PATCH 1 // for backwards-compatible bug fixes + +#include +#include +#include +#include +#include // only to support hash of smart pointers +#include +#include +#include +#include +#if __cplusplus >= 201703L +#include +#endif + +// #define ROBIN_HOOD_LOG_ENABLED +#ifdef ROBIN_HOOD_LOG_ENABLED +#include +#define ROBIN_HOOD_LOG(...) \ + std::cout << __FUNCTION__ << "@" << __LINE__ << ": " << __VA_ARGS__ \ + << std::endl; +#else +#define ROBIN_HOOD_LOG(x) +#endif + +// #define ROBIN_HOOD_TRACE_ENABLED +#ifdef ROBIN_HOOD_TRACE_ENABLED +#include +#define ROBIN_HOOD_TRACE(...) \ + std::cout << __FUNCTION__ << "@" << __LINE__ << ": " << __VA_ARGS__ \ + << std::endl; +#else +#define ROBIN_HOOD_TRACE(x) +#endif + +// #define ROBIN_HOOD_COUNT_ENABLED +#ifdef ROBIN_HOOD_COUNT_ENABLED +#include +#define ROBIN_HOOD_COUNT(x) ++counts().x; +namespace robin_hood { +struct Counts { + uint64_t shiftUp{}; + uint64_t shiftDown{}; +}; +inline std::ostream &operator<<(std::ostream &os, Counts const &c) { + return os << c.shiftUp << " shiftUp" << std::endl + << c.shiftDown << " shiftDown" << std::endl; +} + +static Counts &counts() { + static Counts counts{}; + return counts; +} +} // namespace robin_hood +#else +#define ROBIN_HOOD_COUNT(x) +#endif + +// all non-argument macros should use this facility. 
See +// https://www.fluentcpp.com/2019/05/28/better-macros-better-flags/ +#define ROBIN_HOOD(x) ROBIN_HOOD_PRIVATE_DEFINITION_##x() + +// mark unused members with this macro +#define ROBIN_HOOD_UNUSED(identifier) + +// bitness +#if SIZE_MAX == UINT32_MAX +#define ROBIN_HOOD_PRIVATE_DEFINITION_BITNESS() 32 +#elif SIZE_MAX == UINT64_MAX +#define ROBIN_HOOD_PRIVATE_DEFINITION_BITNESS() 64 +#else +#error Unsupported bitness +#endif + +// endianess +#ifdef _MSC_VER +#define ROBIN_HOOD_PRIVATE_DEFINITION_LITTLE_ENDIAN() 1 +#define ROBIN_HOOD_PRIVATE_DEFINITION_BIG_ENDIAN() 0 +#else +#define ROBIN_HOOD_PRIVATE_DEFINITION_LITTLE_ENDIAN() \ + (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) +#define ROBIN_HOOD_PRIVATE_DEFINITION_BIG_ENDIAN() \ + (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) +#endif + +// inline +#ifdef _MSC_VER +#define ROBIN_HOOD_PRIVATE_DEFINITION_NOINLINE() __declspec(noinline) +#else +#define ROBIN_HOOD_PRIVATE_DEFINITION_NOINLINE() __attribute__((noinline)) +#endif + +// exceptions +#if !defined(__cpp_exceptions) && !defined(__EXCEPTIONS) && !defined(_CPPUNWIND) +#define ROBIN_HOOD_PRIVATE_DEFINITION_HAS_EXCEPTIONS() 0 +#else +#define ROBIN_HOOD_PRIVATE_DEFINITION_HAS_EXCEPTIONS() 1 +#endif + +// count leading/trailing bits +#if !defined(ROBIN_HOOD_DISABLE_INTRINSICS) +#ifdef _MSC_VER +#if ROBIN_HOOD(BITNESS) == 32 +#define ROBIN_HOOD_PRIVATE_DEFINITION_BITSCANFORWARD() _BitScanForward +#else +#define ROBIN_HOOD_PRIVATE_DEFINITION_BITSCANFORWARD() _BitScanForward64 +#endif +#include +#pragma intrinsic(ROBIN_HOOD(BITSCANFORWARD)) +#define ROBIN_HOOD_COUNT_TRAILING_ZEROES(x) \ + [](size_t mask) noexcept->int { \ + unsigned long index; \ + return ROBIN_HOOD(BITSCANFORWARD)(&index, mask) ? static_cast(index) \ + : ROBIN_HOOD(BITNESS); \ + } \ + (x) +#else +#if ROBIN_HOOD(BITNESS) == 32 +#define ROBIN_HOOD_PRIVATE_DEFINITION_CTZ() __builtin_ctzl +#define ROBIN_HOOD_PRIVATE_DEFINITION_CLZ() __builtin_clzl +#else +#define ROBIN_HOOD_PRIVATE_DEFINITION_CTZ() __builtin_ctzll +#define ROBIN_HOOD_PRIVATE_DEFINITION_CLZ() __builtin_clzll +#endif +#define ROBIN_HOOD_COUNT_LEADING_ZEROES(x) \ + ((x) ? ROBIN_HOOD(CLZ)(x) : ROBIN_HOOD(BITNESS)) +#define ROBIN_HOOD_COUNT_TRAILING_ZEROES(x) \ + ((x) ? ROBIN_HOOD(CTZ)(x) : ROBIN_HOOD(BITNESS)) +#endif +#endif + +// fallthrough +#ifndef __has_cpp_attribute // For backwards compatibility +#define __has_cpp_attribute(x) 0 +#endif +#if __has_cpp_attribute(clang::fallthrough) +#define ROBIN_HOOD_PRIVATE_DEFINITION_FALLTHROUGH() [[clang::fallthrough]] +#elif __has_cpp_attribute(gnu::fallthrough) +#define ROBIN_HOOD_PRIVATE_DEFINITION_FALLTHROUGH() [[gnu::fallthrough]] +#else +#define ROBIN_HOOD_PRIVATE_DEFINITION_FALLTHROUGH() +#endif + +// likely/unlikely +#ifdef _MSC_VER +#define ROBIN_HOOD_LIKELY(condition) condition +#define ROBIN_HOOD_UNLIKELY(condition) condition +#else +#define ROBIN_HOOD_LIKELY(condition) __builtin_expect(condition, 1) +#define ROBIN_HOOD_UNLIKELY(condition) __builtin_expect(condition, 0) +#endif + +// detect if native wchar_t type is availiable in MSVC +#ifdef _MSC_VER +#ifdef _NATIVE_WCHAR_T_DEFINED +#define ROBIN_HOOD_PRIVATE_DEFINITION_HAS_NATIVE_WCHART() 1 +#else +#define ROBIN_HOOD_PRIVATE_DEFINITION_HAS_NATIVE_WCHART() 0 +#endif +#else +#define ROBIN_HOOD_PRIVATE_DEFINITION_HAS_NATIVE_WCHART() 1 +#endif + +// detect if MSVC supports the pair(std::piecewise_construct_t,...) 
consructor +// being constexpr +#ifdef _MSC_VER +#if _MSC_VER <= 1900 +#define ROBIN_HOOD_PRIVATE_DEFINITION_BROKEN_CONSTEXPR() 1 +#else +#define ROBIN_HOOD_PRIVATE_DEFINITION_BROKEN_CONSTEXPR() 0 +#endif +#else +#define ROBIN_HOOD_PRIVATE_DEFINITION_BROKEN_CONSTEXPR() 0 +#endif + +// workaround missing "is_trivially_copyable" in g++ < 5.0 +// See https://stackoverflow.com/a/31798726/48181 +#if defined(__GNUC__) && __GNUC__ < 5 +#define ROBIN_HOOD_IS_TRIVIALLY_COPYABLE(...) __has_trivial_copy(__VA_ARGS__) +#else +#define ROBIN_HOOD_IS_TRIVIALLY_COPYABLE(...) \ + std::is_trivially_copyable<__VA_ARGS__>::value +#endif + +// helpers for C++ versions, see +// https://gcc.gnu.org/onlinedocs/cpp/Standard-Predefined-Macros.html +#define ROBIN_HOOD_PRIVATE_DEFINITION_CXX() __cplusplus +#define ROBIN_HOOD_PRIVATE_DEFINITION_CXX98() 199711L +#define ROBIN_HOOD_PRIVATE_DEFINITION_CXX11() 201103L +#define ROBIN_HOOD_PRIVATE_DEFINITION_CXX14() 201402L +#define ROBIN_HOOD_PRIVATE_DEFINITION_CXX17() 201703L + +#if ROBIN_HOOD(CXX) >= ROBIN_HOOD(CXX17) +#define ROBIN_HOOD_PRIVATE_DEFINITION_NODISCARD() [[nodiscard]] +#else +#define ROBIN_HOOD_PRIVATE_DEFINITION_NODISCARD() +#endif + +namespace robin_hood { + +#if ROBIN_HOOD(CXX) >= ROBIN_HOOD(CXX14) +#define ROBIN_HOOD_STD std +#else + +// c++11 compatibility layer +namespace ROBIN_HOOD_STD { +template +struct alignment_of + : std::integral_constant< + std::size_t, alignof(typename std::remove_all_extents::type)> {}; + +template +class integer_sequence { + public: + using value_type = T; + static_assert(std::is_integral::value, "not integral type"); + static constexpr std::size_t size() noexcept { return sizeof...(Ints); } +}; +template +using index_sequence = integer_sequence; + +namespace detail_ { +template +struct IntSeqImpl { + using TValue = T; + static_assert(std::is_integral::value, "not integral type"); + static_assert(Begin >= 0 && Begin < End, + "unexpected argument (Begin<0 || Begin<=End)"); + + template + struct IntSeqCombiner; + + template + struct IntSeqCombiner, + integer_sequence> { + using TResult = integer_sequence; + }; + + using TResult = typename IntSeqCombiner< + typename IntSeqImpl::TResult, + typename IntSeqImpl::TResult>::TResult; +}; + +template +struct IntSeqImpl { + using TValue = T; + static_assert(std::is_integral::value, "not integral type"); + static_assert(Begin >= 0, "unexpected argument (Begin<0)"); + using TResult = integer_sequence; +}; + +template +struct IntSeqImpl { + using TValue = T; + static_assert(std::is_integral::value, "not integral type"); + static_assert(Begin >= 0, "unexpected argument (Begin<0)"); + using TResult = integer_sequence; +}; +} // namespace detail_ + +template +using make_integer_sequence = + typename detail_::IntSeqImpl::TResult; + +template +using make_index_sequence = make_integer_sequence; + +template +using index_sequence_for = make_index_sequence; + +} // namespace ROBIN_HOOD_STD + +#endif + +namespace detail { + +// make sure we static_cast to the correct type for hash_int +#if ROBIN_HOOD(BITNESS) == 64 +using SizeT = uint64_t; +#else +using SizeT = uint32_t; +#endif + +template +T rotr(T x, unsigned k) { + return (x >> k) | (x << (8U * sizeof(T) - k)); +} + +// This cast gets rid of warnings like "cast from 'uint8_t*' {aka 'unsigned +// char*'} to +// 'uint64_t*' {aka 'long unsigned int*'} increases required alignment of target +// type". Use with +// care! 
+template +inline T reinterpret_cast_no_cast_align_warning(void *ptr) noexcept { + return reinterpret_cast(ptr); +} + +template +inline T reinterpret_cast_no_cast_align_warning(void const *ptr) noexcept { + return reinterpret_cast(ptr); +} + +// make sure this is not inlined as it is slow and dramatically enlarges code, +// thus making other +// inlinings more difficult. Throws are also generally the slow path. +template +[[noreturn]] ROBIN_HOOD(NOINLINE) +#if ROBIN_HOOD(HAS_EXCEPTIONS) + void doThrow(Args &&... args) { + // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-array-to-pointer-decay) + throw E(std::forward(args)...); +} +#else + void doThrow(Args &&... ROBIN_HOOD_UNUSED(args) /*unused*/) { + abort(); +} +#endif + +template +T *assertNotNull(T *t, Args &&... args) { + if (ROBIN_HOOD_UNLIKELY(nullptr == t)) { + doThrow(std::forward(args)...); + } + return t; +} + +template +inline T unaligned_load(void const *ptr) noexcept { + // using memcpy so we don't get into unaligned load problems. + // compiler should optimize this very well anyways. + T t; + std::memcpy(&t, ptr, sizeof(T)); + return t; +} + +// Allocates bulks of memory for objects of type T. This deallocates the memory +// in the destructor, +// and keeps a linked list of the allocated memory around. Overhead per +// allocation is the size of a +// pointer. +template +class BulkPoolAllocator { + public: + BulkPoolAllocator() noexcept = default; + + // does not copy anything, just creates a new allocator. + BulkPoolAllocator(const BulkPoolAllocator &ROBIN_HOOD_UNUSED( + o) /*unused*/) noexcept : mHead(nullptr), + mListForFree(nullptr) {} + + BulkPoolAllocator(BulkPoolAllocator &&o) noexcept + : mHead(o.mHead), + mListForFree(o.mListForFree) { + o.mListForFree = nullptr; + o.mHead = nullptr; + } + + BulkPoolAllocator &operator=(BulkPoolAllocator &&o) noexcept { + reset(); + mHead = o.mHead; + mListForFree = o.mListForFree; + o.mListForFree = nullptr; + o.mHead = nullptr; + return *this; + } + + BulkPoolAllocator & + // NOLINTNEXTLINE(bugprone-unhandled-self-assignment,cert-oop54-cpp) + operator=(const BulkPoolAllocator &ROBIN_HOOD_UNUSED(o) /*unused*/) noexcept { + // does not do anything + return *this; + } + + ~BulkPoolAllocator() noexcept { reset(); } + + // Deallocates all allocated memory. + void reset() noexcept { + while (mListForFree) { + T *tmp = *mListForFree; + ROBIN_HOOD_LOG("std::free") + std::free(mListForFree); + mListForFree = reinterpret_cast_no_cast_align_warning(tmp); + } + mHead = nullptr; + } + + // allocates, but does NOT initialize. Use in-place new constructor, e.g. + // T* obj = pool.allocate(); + // ::new (static_cast(obj)) T(); + T *allocate() { + T *tmp = mHead; + if (!tmp) { + tmp = performAllocation(); + } + + mHead = *reinterpret_cast_no_cast_align_warning(tmp); + return tmp; + } + + // does not actually deallocate but puts it in store. + // make sure you have already called the destructor! e.g. with + // obj->~T(); + // pool.deallocate(obj); + void deallocate(T *obj) noexcept { + *reinterpret_cast_no_cast_align_warning(obj) = mHead; + mHead = obj; + } + + // Adds an already allocated block of memory to the allocator. This allocator + // is from now on + // responsible for freeing the data (with free()). If the provided data is not + // large enough to + // make use of, it is immediately freed. Otherwise it is reused and freed in + // the destructor. 
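unaligned_load above is the standard memcpy-based way to read a word from a possibly misaligned address without undefined behavior; compilers collapse it to a single load on targets that allow it. Restated minimally for reference, generic rather than specific to this header.

```cpp
// memcpy-based unaligned read: well-defined for any address, and
// optimized down to a plain load on architectures that permit it.
#include <cstring>

template <typename T>
T load_unaligned(const void* ptr) {
  T value;
  std::memcpy(&value, ptr, sizeof(T));
  return value;
}
```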
+ void addOrFree(void *ptr, const size_t numBytes) noexcept { + // calculate number of available elements in ptr + if (numBytes < ALIGNMENT + ALIGNED_SIZE) { + // not enough data for at least one element. Free and return. + ROBIN_HOOD_LOG("std::free") + std::free(ptr); + } else { + ROBIN_HOOD_LOG("add to buffer") + add(ptr, numBytes); + } + } + + void swap(BulkPoolAllocator &other) noexcept { + using std::swap; + swap(mHead, other.mHead); + swap(mListForFree, other.mListForFree); + } + + private: + // iterates the list of allocated memory to calculate how many to alloc next. + // Recalculating this each time saves us a size_t member. + // This ignores the fact that memory blocks might have been added manually + // with addOrFree. In + // practice, this should not matter much. + ROBIN_HOOD(NODISCARD) size_t calcNumElementsToAlloc() const noexcept { + auto tmp = mListForFree; + size_t numAllocs = MinNumAllocs; + + while (numAllocs * 2 <= MaxNumAllocs && tmp) { + auto x = reinterpret_cast(tmp); + tmp = *x; + numAllocs *= 2; + } + + return numAllocs; + } + + // WARNING: Underflow if numBytes < ALIGNMENT! This is guarded in addOrFree(). + void add(void *ptr, const size_t numBytes) noexcept { + const size_t numElements = (numBytes - ALIGNMENT) / ALIGNED_SIZE; + + auto data = reinterpret_cast(ptr); + + // link free list + auto x = reinterpret_cast(data); + *x = mListForFree; + mListForFree = data; + + // create linked list for newly allocated data + auto *const headT = reinterpret_cast_no_cast_align_warning( + reinterpret_cast(ptr) + ALIGNMENT); + + auto *const head = reinterpret_cast(headT); + + // Visual Studio compiler automatically unrolls this loop, which is pretty + // cool + for (size_t i = 0; i < numElements; ++i) { + *reinterpret_cast_no_cast_align_warning( + head + i * ALIGNED_SIZE) = head + (i + 1) * ALIGNED_SIZE; + } + + // last one points to 0 + *reinterpret_cast_no_cast_align_warning( + head + (numElements - 1) * ALIGNED_SIZE) = mHead; + mHead = headT; + } + + // Called when no memory is available (mHead == 0). + // Don't inline this slow path. + ROBIN_HOOD(NOINLINE) T *performAllocation() { + size_t const numElementsToAlloc = calcNumElementsToAlloc(); + + // alloc new memory: [prev |T, T, ... T] + size_t const bytes = ALIGNMENT + ALIGNED_SIZE * numElementsToAlloc; + ROBIN_HOOD_LOG("std::malloc " << bytes << " = " << ALIGNMENT << " + " + << ALIGNED_SIZE << " * " + << numElementsToAlloc) + add(assertNotNull(std::malloc(bytes)), bytes); + return mHead; + } + +// enforce byte alignment of the T's +#if ROBIN_HOOD(CXX) >= ROBIN_HOOD(CXX14) + static constexpr size_t ALIGNMENT = + (std::max)(std::alignment_of::value, std::alignment_of::value); +#else + static const size_t ALIGNMENT = + (ROBIN_HOOD_STD::alignment_of::value > + ROBIN_HOOD_STD::alignment_of::value) + ? ROBIN_HOOD_STD::alignment_of::value + : +ROBIN_HOOD_STD::alignment_of::value; // the + is for + // walkarround +#endif + + static constexpr size_t ALIGNED_SIZE = + ((sizeof(T) - 1) / ALIGNMENT + 1) * ALIGNMENT; + + static_assert(MinNumAllocs >= 1, "MinNumAllocs"); + static_assert(MaxNumAllocs >= MinNumAllocs, "MaxNumAllocs"); + static_assert(ALIGNED_SIZE >= sizeof(T *), "ALIGNED_SIZE"); + static_assert(0 == (ALIGNED_SIZE % sizeof(T *)), "ALIGNED_SIZE mod"); + static_assert(ALIGNMENT >= sizeof(T *), "ALIGNMENT"); + + T *mHead{nullptr}; + T **mListForFree{nullptr}; +}; + +template +struct NodeAllocator; + +// dummy allocator that does nothing +template +struct NodeAllocator { + // we are not using the data, so just free it. 
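BulkPoolAllocator above keeps freed nodes on an intrusive free list: the link to the next free node is written into the freed node's own bytes, so allocate/deallocate cost a pointer swap and no extra bookkeeping memory. A stripped-down sketch of just that idea; it never releases memory and relies on the static_assert shown, both of which the real allocator handles more carefully.

```cpp
// Intrusive free-list sketch: reuse a freed node's own storage to hold
// the link to the next free node. Simplification: blocks obtained from
// malloc are never freed here, unlike BulkPoolAllocator::reset().
#include <cstdlib>

template <typename T>
struct TinyPool {
  static_assert(sizeof(T) >= sizeof(T*), "node must be able to hold a link");

  T* free_head = nullptr;

  T* allocate() {
    if (free_head == nullptr) {
      return static_cast<T*>(std::malloc(sizeof(T)));
    }
    T* node = free_head;
    free_head = *reinterpret_cast<T**>(node);  // pop the free list
    return node;
  }

  void deallocate(T* node) {
    *reinterpret_cast<T**>(node) = free_head;  // push onto the free list
    free_head = node;
  }
};
```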
+ void addOrFree(void *ptr, + size_t ROBIN_HOOD_UNUSED(numBytes) /*unused*/) noexcept { + ROBIN_HOOD_LOG("std::free") + std::free(ptr); + } +}; + +template +struct NodeAllocator + : public BulkPoolAllocator {}; + +// c++14 doesn't have is_nothrow_swappable, and clang++ 6.0.1 doesn't like it +// either, so I'm making +// my own here. +namespace swappable { +#if ROBIN_HOOD(CXX) < ROBIN_HOOD(CXX17) +using std::swap; +template +struct nothrow { + static const bool value = + noexcept(swap(std::declval(), std::declval())); +}; +#else +template +struct nothrow { + static const bool value = std::is_nothrow_swappable::value; +}; +#endif +} // namespace swappable + +} // namespace detail + +struct is_transparent_tag {}; + +// A custom pair implementation is used in the map because std::pair is not +// is_trivially_copyable, +// which means it would not be allowed to be used in std::memcpy. This struct +// is copyable, which is +// also tested. +template +struct pair { + using first_type = T1; + using second_type = T2; + + template ::value && + std::is_default_constructible::value>::type> + constexpr pair() noexcept(noexcept(U1()) && noexcept(U2())) + : first(), second() {} + + // pair constructors are explicit so we don't accidentally call this ctor when + // we don't have to. + explicit constexpr pair(std::pair const &o) noexcept( + noexcept(T1(std::declval())) && + noexcept(T2(std::declval()))) + : first(o.first), second(o.second) {} + + // pair constructors are explicit so we don't accidentally call this ctor when + // we don't have to. + explicit constexpr pair(std::pair &&o) noexcept( + noexcept(T1(std::move(std::declval()))) && + noexcept(T2(std::move(std::declval())))) + : first(std::move(o.first)), second(std::move(o.second)) {} + + constexpr pair(T1 &&a, T2 &&b) noexcept( + noexcept(T1(std::move(std::declval()))) && + noexcept(T2(std::move(std::declval())))) + : first(std::move(a)), second(std::move(b)) {} + + template + constexpr pair(U1 &&a, U2 &&b) noexcept( + noexcept(T1(std::forward(std::declval()))) && + noexcept(T2(std::forward(std::declval())))) + : first(std::forward(a)), second(std::forward(b)) {} + + template +// MSVC 2015 produces error "C2476: ‘constexpr’ constructor does not initialize +// all members" +// if this constructor is constexpr +#if !ROBIN_HOOD(BROKEN_CONSTEXPR) + constexpr +#endif + pair(std::piecewise_construct_t /*unused*/, std::tuple a, + std::tuple + b) noexcept(noexcept(pair(std::declval &>(), + std::declval &>(), + ROBIN_HOOD_STD::index_sequence_for< + U1...>(), + ROBIN_HOOD_STD::index_sequence_for< + U2...>()))) + : pair(a, b, ROBIN_HOOD_STD::index_sequence_for(), + ROBIN_HOOD_STD::index_sequence_for()) { + } + + // constructor called from the std::piecewise_construct_t ctor + template + pair( + std::tuple &a, std::tuple &b, + ROBIN_HOOD_STD::index_sequence /*unused*/, + ROBIN_HOOD_STD::index_sequence< + I2...> /*unused*/) noexcept(noexcept(T1(std:: + forward(std::get( + std::declval< + std::tuple + &>()))...)) && + noexcept(T2(std::forward(std::get( + std::declval< + std::tuple &>()))...))) + : first(std::forward(std::get(a))...), + second(std::forward(std::get(b))...) { + // make visual studio compiler happy about warning about unused a & b. + // Visual studio's pair implementation disables warning 4100. 
+ (void)a; + (void)b; + } + + void swap(pair &o) noexcept((detail::swappable::nothrow::value) && + (detail::swappable::nothrow::value)) { + using std::swap; + swap(first, o.first); + swap(second, o.second); + } + + T1 first; // NOLINT(misc-non-private-member-variables-in-classes) + T2 second; // NOLINT(misc-non-private-member-variables-in-classes) +}; + +template +inline void swap(pair &a, pair &b) noexcept( + noexcept(std::declval &>().swap(std::declval &>()))) { + a.swap(b); +} + +template +inline constexpr bool operator==(pair const &x, pair const &y) { + return (x.first == y.first) && (x.second == y.second); +} +template +inline constexpr bool operator!=(pair const &x, pair const &y) { + return !(x == y); +} +template +inline constexpr bool +operator<(pair const &x, pair const &y) noexcept( + noexcept(std::declval() < std::declval()) && + noexcept(std::declval() < std::declval())) { + return x.first < y.first || (!(y.first < x.first) && x.second < y.second); +} +template +inline constexpr bool operator>(pair const &x, pair const &y) { + return y < x; +} +template +inline constexpr bool operator<=(pair const &x, pair const &y) { + return !(x > y); +} +template +inline constexpr bool operator>=(pair const &x, pair const &y) { + return !(x < y); +} + +inline size_t hash_bytes(void const *ptr, size_t len) noexcept { + static constexpr uint64_t m = UINT64_C(0xc6a4a7935bd1e995); + static constexpr uint64_t seed = UINT64_C(0xe17a1465); + static constexpr unsigned int r = 47; + + auto const *const data64 = static_cast(ptr); + uint64_t h = seed ^ (len * m); + + size_t const n_blocks = len / 8; + for (size_t i = 0; i < n_blocks; ++i) { + auto k = detail::unaligned_load(data64 + i); + + k *= m; + k ^= k >> r; + k *= m; + + h ^= k; + h *= m; + } + + auto const *const data8 = + reinterpret_cast(data64 + n_blocks); + switch (len & 7U) { + case 7: + h ^= static_cast(data8[6]) << 48U; + ROBIN_HOOD(FALLTHROUGH); // FALLTHROUGH + case 6: + h ^= static_cast(data8[5]) << 40U; + ROBIN_HOOD(FALLTHROUGH); // FALLTHROUGH + case 5: + h ^= static_cast(data8[4]) << 32U; + ROBIN_HOOD(FALLTHROUGH); // FALLTHROUGH + case 4: + h ^= static_cast(data8[3]) << 24U; + ROBIN_HOOD(FALLTHROUGH); // FALLTHROUGH + case 3: + h ^= static_cast(data8[2]) << 16U; + ROBIN_HOOD(FALLTHROUGH); // FALLTHROUGH + case 2: + h ^= static_cast(data8[1]) << 8U; + ROBIN_HOOD(FALLTHROUGH); // FALLTHROUGH + case 1: + h ^= static_cast(data8[0]); + h *= m; + ROBIN_HOOD(FALLTHROUGH); // FALLTHROUGH + default: + break; + } + + h ^= h >> r; + + // not doing the final step here, because this will be done by keyToIdx + // anyways + // h *= m; + // h ^= h >> r; + return static_cast(h); +} + +inline size_t hash_int(uint64_t x) noexcept { + // tried lots of different hashes, let's stick with murmurhash3. It's simple, + // fast, well tested, + // and doesn't need any special 128bit operations. + x ^= x >> 33U; + x *= UINT64_C(0xff51afd7ed558ccd); + x ^= x >> 33U; + + // not doing the final step here, because this will be done by keyToIdx + // anyways + // x *= UINT64_C(0xc4ceb9fe1a85ec53); + // x ^= x >> 33U; + return static_cast(x); +} + +// A thin wrapper around std::hash, performing an additional simple mixing step +// of the result. 
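hash_bytes and hash_int above are the murmur-style mixers the map builds on; robin_hood::hash<T> additionally runs std::hash's result through the integer mixer so sequential keys do not land in sequential buckets. The mixing step on its own, for reference; the remaining multiply-and-shift is deliberately deferred to keyToIdx, as the comments above note.

```cpp
// Integer mixing step used by robin_hood::hash (murmur3-style finalizer
// without the last multiply, which keyToIdx applies later).
#include <cstddef>
#include <cstdint>

inline size_t mix_key(uint64_t x) {
  x ^= x >> 33U;
  x *= UINT64_C(0xff51afd7ed558ccd);
  x ^= x >> 33U;
  return static_cast<size_t>(x);
}
```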
+template +struct hash : public std::hash { + size_t operator()(T const &obj) const noexcept(noexcept( + std::declval>().operator()(std::declval()))) { + // call base hash + auto result = std::hash::operator()(obj); + // return mixed of that, to be save against identity has + return hash_int(static_cast(result)); + } +}; + +template +struct hash> { + size_t operator()(std::basic_string const &str) const noexcept { + return hash_bytes(str.data(), sizeof(CharT) * str.size()); + } +}; + +#if ROBIN_HOOD(CXX) >= ROBIN_HOOD(CXX17) +template +struct hash> { + size_t operator()(std::basic_string_view const &sv) const noexcept { + return hash_bytes(sv.data(), sizeof(CharT) * sv.size()); + } +}; +#endif + +template +struct hash { + size_t operator()(T *ptr) const noexcept { + return hash_int(reinterpret_cast(ptr)); + } +}; + +template +struct hash> { + size_t operator()(std::unique_ptr const &ptr) const noexcept { + return hash_int(reinterpret_cast(ptr.get())); + } +}; + +template +struct hash> { + size_t operator()(std::shared_ptr const &ptr) const noexcept { + return hash_int(reinterpret_cast(ptr.get())); + } +}; + +template +struct hash::value>::type> { + size_t operator()(Enum e) const noexcept { + using Underlying = typename std::underlying_type::type; + return hash{}(static_cast(e)); + } +}; + +#define ROBIN_HOOD_HASH_INT(T) \ + template <> \ + struct hash { \ + size_t operator()(T const &obj) const noexcept { \ + return hash_int(static_cast(obj)); \ + } \ + } + +#if defined(__GNUC__) && !defined(__clang__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wuseless-cast" +#endif +// see https://en.cppreference.com/w/cpp/utility/hash +ROBIN_HOOD_HASH_INT(bool); +ROBIN_HOOD_HASH_INT(char); +ROBIN_HOOD_HASH_INT(signed char); +ROBIN_HOOD_HASH_INT(unsigned char); +ROBIN_HOOD_HASH_INT(char16_t); +ROBIN_HOOD_HASH_INT(char32_t); +#if ROBIN_HOOD(HAS_NATIVE_WCHART) +ROBIN_HOOD_HASH_INT(wchar_t); +#endif +ROBIN_HOOD_HASH_INT(short); +ROBIN_HOOD_HASH_INT(unsigned short); +ROBIN_HOOD_HASH_INT(int); +ROBIN_HOOD_HASH_INT(unsigned int); +ROBIN_HOOD_HASH_INT(long); +ROBIN_HOOD_HASH_INT(long long); +ROBIN_HOOD_HASH_INT(unsigned long); +ROBIN_HOOD_HASH_INT(unsigned long long); +#if defined(__GNUC__) && !defined(__clang__) +#pragma GCC diagnostic pop +#endif +namespace detail { + +template +struct void_type { + using type = void; +}; + +template +struct has_is_transparent : public std::false_type {}; + +template +struct has_is_transparent::type> + : public std::true_type {}; + +// using wrapper classes for hash and key_equal prevents the diamond problem +// when the same type +// is used. see https://stackoverflow.com/a/28771920/48181 +template +struct WrapHash : public T { + WrapHash() = default; + explicit WrapHash(T const &o) noexcept(noexcept(T(std::declval()))) + : T(o) {} +}; + +template +struct WrapKeyEqual : public T { + WrapKeyEqual() = default; + explicit WrapKeyEqual(T const &o) noexcept( + noexcept(T(std::declval()))) + : T(o) {} +}; + +// A highly optimized hashmap implementation, using the Robin Hood algorithm. +// +// In most cases, this map should be usable as a drop-in replacement for +// std::unordered_map, but +// be about 2x faster in most cases and require much less allocations. +// +// This implementation uses the following memory layout: +// +// [Node, Node, ... Node | info, info, ... infoSentinel ] +// +// * Node: either a DataNode that directly has the std::pair as +// member, +// or a DataNode with a pointer to std::pair. 
Which DataNode +// representation to use +// depends on how fast the swap() operation is. Heuristically, this is +// automatically choosen +// based on sizeof(). there are always 2^n Nodes. +// +// * info: Each Node in the map has a corresponding info byte, so there are 2^n +// info bytes. +// Each byte is initialized to 0, meaning the corresponding Node is empty. Set +// to 1 means the +// corresponding node contains data. Set to 2 means the corresponding Node is +// filled, but it +// actually belongs to the previous position and was pushed out because that +// place is already +// taken. +// +// * infoSentinel: Sentinel byte set to 1, so that iterator's ++ can stop at +// end() without the +// need for a idx variable. +// +// According to STL, order of templates has effect on throughput. That's why +// I've moved the +// boolean to the front. +// https://www.reddit.com/r/cpp/comments/ahp6iu/compile_time_binary_size_reductions_and_cs_future/eeguck4/ +template +class Table + : public WrapHash, + public WrapKeyEqual, + detail::NodeAllocator< + typename std::conditional< + std::is_void::value, Key, + robin_hood::pair< + typename std::conditional::type, + T>>::type, + 4, 16384, IsFlat> { + public: + static constexpr bool is_flat = IsFlat; + static constexpr bool is_map = !std::is_void::value; + static constexpr bool is_set = !is_map; + static constexpr bool is_transparent = + has_is_transparent::value && has_is_transparent::value; + + using key_type = Key; + using mapped_type = T; + using value_type = typename std::conditional< + is_set, Key, + robin_hood::pair::type, + T>>::type; + using size_type = size_t; + using hasher = Hash; + using key_equal = KeyEqual; + using Self = + Table; + + private: + static_assert(MaxLoadFactor100 > 10 && MaxLoadFactor100 < 100, + "MaxLoadFactor100 needs to be >10 && < 100"); + + using WHash = WrapHash; + using WKeyEqual = WrapKeyEqual; + + // configuration defaults + + // make sure we have 8 elements, needed to quickly rehash mInfo + static constexpr size_t InitialNumElements = sizeof(uint64_t); + static constexpr uint32_t InitialInfoNumBits = 5; + static constexpr uint8_t InitialInfoInc = 1U << InitialInfoNumBits; + static constexpr size_t InfoMask = InitialInfoInc - 1U; + static constexpr uint8_t InitialInfoHashShift = 0; + using DataPool = detail::NodeAllocator; + + // type needs to be wider than uint8_t. + using InfoType = uint32_t; + + // DataNode //////////////////////////////////////////////////////// + + // Primary template for the data node. We have special implementations for + // small and big + // objects. For large objects it is assumed that swap() is fairly slow, so we + // allocate these + // on the heap so swap merely swaps a pointer. + template + class DataNode {}; + + // Small: just allocate on the stack. + template + class DataNode final { + public: + template + explicit DataNode( + M &ROBIN_HOOD_UNUSED(map) /*unused*/, + Args &&... args) noexcept(noexcept(value_type(std:: + forward( + args)...))) + : mData(std::forward(args)...) 
{} + + DataNode( + M &ROBIN_HOOD_UNUSED(map) /*unused*/, + DataNode + &&n) noexcept(std::is_nothrow_move_constructible::value) + : mData(std::move(n.mData)) {} + + // doesn't do anything + void destroy(M &ROBIN_HOOD_UNUSED(map) /*unused*/) noexcept {} + void destroyDoNotDeallocate() noexcept {} + + value_type const *operator->() const noexcept { return &mData; } + value_type *operator->() noexcept { return &mData; } + + const value_type &operator*() const noexcept { return mData; } + + value_type &operator*() noexcept { return mData; } + + template + ROBIN_HOOD(NODISCARD) + typename std::enable_if::type + getFirst() noexcept { + return mData.first; + } + template + ROBIN_HOOD(NODISCARD) + typename std::enable_if::type getFirst() noexcept { + return mData; + } + + template + ROBIN_HOOD(NODISCARD) + typename std::enable_if::type + getFirst() const noexcept { + return mData.first; + } + template + ROBIN_HOOD(NODISCARD) + typename std::enable_if::type getFirst() const + noexcept { + return mData; + } + + template + ROBIN_HOOD(NODISCARD) + typename std::enable_if::type getSecond() noexcept { + return mData.second; + } + + template + ROBIN_HOOD(NODISCARD) + typename std::enable_if::type getSecond() const + noexcept { + return mData.second; + } + + void swap(DataNode &o) noexcept( + noexcept(std::declval().swap(std::declval()))) { + mData.swap(o.mData); + } + + private: + value_type mData; + }; + + // big object: allocate on heap. + template + class DataNode { + public: + template + explicit DataNode(M &map, Args &&... args) : mData(map.allocate()) { + ::new (static_cast(mData)) + value_type(std::forward(args)...); + } + + DataNode(M &ROBIN_HOOD_UNUSED(map) /*unused*/, + DataNode &&n) noexcept : mData(std::move(n.mData)) {} + + void destroy(M &map) noexcept { + // don't deallocate, just put it into list of datapool. + mData->~value_type(); + map.deallocate(mData); + } + + void destroyDoNotDeallocate() noexcept { mData->~value_type(); } + + value_type const *operator->() const noexcept { return mData; } + + value_type *operator->() noexcept { return mData; } + + const value_type &operator*() const { return *mData; } + + value_type &operator*() { return *mData; } + + template + ROBIN_HOOD(NODISCARD) + typename std::enable_if::type + getFirst() noexcept { + return mData->first; + } + template + ROBIN_HOOD(NODISCARD) + typename std::enable_if::type getFirst() noexcept { + return *mData; + } + + template + ROBIN_HOOD(NODISCARD) + typename std::enable_if::type + getFirst() const noexcept { + return mData->first; + } + template + ROBIN_HOOD(NODISCARD) + typename std::enable_if::type getFirst() const + noexcept { + return *mData; + } + + template + ROBIN_HOOD(NODISCARD) + typename std::enable_if::type getSecond() noexcept { + return mData->second; + } + + template + ROBIN_HOOD(NODISCARD) + typename std::enable_if::type getSecond() const + noexcept { + return mData->second; + } + + void swap(DataNode &o) noexcept { + using std::swap; + swap(mData, o.mData); + } + + private: + value_type *mData; + }; + + using Node = DataNode; + + // helpers for insertKeyPrepareEmptySpot: extract first entry (only const + // required) + ROBIN_HOOD(NODISCARD) + key_type const &getFirstConst(Node const &n) const noexcept { + return n.getFirst(); + } + + // in case we have void mapped_type, we are not using a pair, thus we just + // route k through. + // No need to disable this because it's just not used if not applicable. 
+ ROBIN_HOOD(NODISCARD) + key_type const &getFirstConst(key_type const &k) const noexcept { return k; } + + // in case we have non-void mapped_type, we have a standard robin_hood::pair + template + ROBIN_HOOD(NODISCARD) + typename std::enable_if::value, key_type const &>::type + getFirstConst(value_type const &vt) const noexcept { + return vt.first; + } + + // Cloner ////////////////////////////////////////////////////////// + + template + struct Cloner; + + // fast path: Just copy data, without allocating anything. + template + struct Cloner { + void operator()(M const &source, M &target) const { + auto const *const src = reinterpret_cast(source.mKeyVals); + auto *tgt = reinterpret_cast(target.mKeyVals); + auto const numElementsWithBuffer = + target.calcNumElementsWithBuffer(target.mMask + 1); + std::copy(src, src + target.calcNumBytesTotal(numElementsWithBuffer), + tgt); + } + }; + + template + struct Cloner { + void operator()(M const &s, M &t) const { + auto const numElementsWithBuffer = + t.calcNumElementsWithBuffer(t.mMask + 1); + std::copy(s.mInfo, s.mInfo + t.calcNumBytesInfo(numElementsWithBuffer), + t.mInfo); + + for (size_t i = 0; i < numElementsWithBuffer; ++i) { + if (t.mInfo[i]) { + ::new (static_cast(t.mKeyVals + i)) Node(t, *s.mKeyVals[i]); + } + } + } + }; + + // Destroyer /////////////////////////////////////////////////////// + + template + struct Destroyer {}; + + template + struct Destroyer { + void nodes(M &m) const noexcept { m.mNumElements = 0; } + + void nodesDoNotDeallocate(M &m) const noexcept { m.mNumElements = 0; } + }; + + template + struct Destroyer { + void nodes(M &m) const noexcept { + m.mNumElements = 0; + // clear also resets mInfo to 0, that's sometimes not necessary. + auto const numElementsWithBuffer = + m.calcNumElementsWithBuffer(m.mMask + 1); + + for (size_t idx = 0; idx < numElementsWithBuffer; ++idx) { + if (0 != m.mInfo[idx]) { + Node &n = m.mKeyVals[idx]; + n.destroy(m); + n.~Node(); + } + } + } + + void nodesDoNotDeallocate(M &m) const noexcept { + m.mNumElements = 0; + // clear also resets mInfo to 0, that's sometimes not necessary. + auto const numElementsWithBuffer = + m.calcNumElementsWithBuffer(m.mMask + 1); + for (size_t idx = 0; idx < numElementsWithBuffer; ++idx) { + if (0 != m.mInfo[idx]) { + Node &n = m.mKeyVals[idx]; + n.destroyDoNotDeallocate(); + n.~Node(); + } + } + } + }; + + // Iter //////////////////////////////////////////////////////////// + + struct fast_forward_tag {}; + + // generic iterator for both const_iterator and iterator. + template + // NOLINTNEXTLINE(hicpp-special-member-functions,cppcoreguidelines-special-member-functions) + class Iter { + private: + using NodePtr = + typename std::conditional::type; + + public: + using difference_type = std::ptrdiff_t; + using value_type = typename Self::value_type; + using reference = typename std::conditional::type; + using pointer = typename std::conditional::type; + using iterator_category = std::forward_iterator_tag; + + // default constructed iterator can be compared to itself, but WON'T return + // true when + // compared to end(). + Iter() = default; + + // Rule of zero: nothing specified. The conversion constructor is only + // enabled for + // iterator to const_iterator, so it doesn't accidentally work as a copy + // ctor. + + // Conversion constructor from iterator to const_iterator. 
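The Cloner and Destroyer specializations above pick a bulk byte copy (and a no-op destroy) when the nodes are flat and trivially copyable, falling back to per-element work otherwise. A self-contained sketch of that dispatch, under the assumption that the destination elements already exist; names are hypothetical:

#include <cstddef>
#include <cstring>
#include <type_traits>

// Hypothetical illustration: trivially copyable nodes can be cloned with one
// memcpy; anything else is copied element by element via copy assignment.
template <typename Node>
void clone_nodes(const Node *src, Node *dst, std::size_t n) {
  if (std::is_trivially_copyable<Node>::value) {
    std::memcpy(static_cast<void *>(dst), static_cast<const void *>(src),
                n * sizeof(Node));
  } else {
    for (std::size_t i = 0; i < n; ++i) {
      dst[i] = src[i];  // assumes dst[i] was already constructed
    }
  }
}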
+ template ::type> + // NOLINTNEXTLINE(hicpp-explicit-conversions) + Iter(Iter const &other) noexcept : mKeyVals(other.mKeyVals), + mInfo(other.mInfo) {} + + Iter(NodePtr valPtr, uint8_t const *infoPtr) noexcept : mKeyVals(valPtr), + mInfo(infoPtr) {} + + Iter(NodePtr valPtr, uint8_t const *infoPtr, + fast_forward_tag ROBIN_HOOD_UNUSED(tag) /*unused*/) noexcept + : mKeyVals(valPtr), + mInfo(infoPtr) { + fastForward(); + } + + template ::type> + Iter &operator=(Iter const &other) noexcept { + mKeyVals = other.mKeyVals; + mInfo = other.mInfo; + return *this; + } + + // prefix increment. Undefined behavior if we are at end()! + Iter &operator++() noexcept { + mInfo++; + mKeyVals++; + fastForward(); + return *this; + } + + Iter operator++(int)noexcept { + Iter tmp = *this; + ++(*this); + return tmp; + } + + reference operator*() const { return **mKeyVals; } + + pointer operator->() const { return &**mKeyVals; } + + template + bool operator==(Iter const &o) const noexcept { + return mKeyVals == o.mKeyVals; + } + + template + bool operator!=(Iter const &o) const noexcept { + return mKeyVals != o.mKeyVals; + } + + private: + // fast forward to the next non-free info byte + // I've tried a few variants that don't depend on intrinsics, but + // unfortunately they are + // quite a bit slower than this one. So I've reverted that change again. See + // map_benchmark. + void fastForward() noexcept { + size_t n = 0; + while (0U == (n = detail::unaligned_load(mInfo))) { + mInfo += sizeof(size_t); + mKeyVals += sizeof(size_t); + } +#if defined(ROBIN_HOOD_DISABLE_INTRINSICS) + // we know for certain that within the next 8 bytes we'll find a non-zero + // one. + if (ROBIN_HOOD_UNLIKELY(0U == detail::unaligned_load(mInfo))) { + mInfo += 4; + mKeyVals += 4; + } + if (ROBIN_HOOD_UNLIKELY(0U == detail::unaligned_load(mInfo))) { + mInfo += 2; + mKeyVals += 2; + } + if (ROBIN_HOOD_UNLIKELY(0U == *mInfo)) { + mInfo += 1; + mKeyVals += 1; + } +#else +#if ROBIN_HOOD(LITTLE_ENDIAN) + auto inc = ROBIN_HOOD_COUNT_TRAILING_ZEROES(n) / 8; +#else + auto inc = ROBIN_HOOD_COUNT_LEADING_ZEROES(n) / 8; +#endif + mInfo += inc; + mKeyVals += inc; +#endif + } + + friend class Table; + NodePtr mKeyVals{nullptr}; + uint8_t const *mInfo{nullptr}; + }; + + //////////////////////////////////////////////////////////////////// + + // highly performance relevant code. + // Lower bits are used for indexing into the array (2^n size) + // The upper 1-5 bits need to be a reasonable good hash, to save comparisons. + template + void keyToIdx(HashKey &&key, size_t *idx, InfoType *info) const { + // In addition to whatever hash is used, add another mul & shift so we get + // better hashing. + // This serves as a bad hash prevention, if the given data is + // badly mixed. + auto h = static_cast(WHash::operator()(key)); + + h *= mHashMultiplier; + h ^= h >> 33U; + + // the lower InitialInfoNumBits are reserved for info. + *info = mInfoInc + static_cast((h & InfoMask) >> mInfoHashShift); + *idx = (static_cast(h) >> InitialInfoNumBits) & mMask; + } + + // forwards the index by one, wrapping around at the end + void next(InfoType *info, size_t *idx) const noexcept { + *idx = *idx + 1; + *info += mInfoInc; + } + + void nextWhileLess(InfoType *info, size_t *idx) const noexcept { + // unrolling this by hand did not bring any speedups. + while (*info < mInfo[*idx]) { + next(info, idx); + } + } + + // Shift everything up by one element. Tries to move stuff around. 
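keyToIdx() above mixes the user-provided hash with an extra multiply and xor-shift, then splits the result: the low InitialInfoNumBits feed the info byte and the remaining high bits select the bucket. A standalone sketch using the initial constants from this header (mInfoHashShift starts at 0); the function name is hypothetical:

#include <cstdint>
#include <cstdio>

// Hypothetical restatement of keyToIdx(): mix the hash, then split it into
// (info, bucket index). Constants mirror the initial values in this header.
void key_to_idx(uint64_t h, uint64_t mask, uint64_t *idx, uint32_t *info) {
  const uint32_t kInfoNumBits = 5;               // InitialInfoNumBits
  const uint32_t kInfoInc = 1U << kInfoNumBits;  // InitialInfoInc == 32
  const uint64_t kInfoMask = kInfoInc - 1U;      // InfoMask == 31

  h *= UINT64_C(0xc4ceb9fe1a85ec53);  // initial mHashMultiplier
  h ^= h >> 33U;

  *info = kInfoInc + static_cast<uint32_t>(h & kInfoMask);  // low bits -> info
  *idx = (h >> kInfoNumBits) & mask;                        // high bits -> bucket
}

int main() {
  uint64_t idx = 0;
  uint32_t info = 0;
  key_to_idx(12345U, /*mask=*/63U, &idx, &info);  // 64-bucket table
  std::printf("idx=%llu info=%u\n",
              static_cast<unsigned long long>(idx), info);
  return 0;
}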
+ void shiftUp(size_t startIdx, size_t const insertion_idx) noexcept( + std::is_nothrow_move_assignable::value) { + auto idx = startIdx; + ::new (static_cast(mKeyVals + idx)) + Node(std::move(mKeyVals[idx - 1])); + while (--idx != insertion_idx) { + mKeyVals[idx] = std::move(mKeyVals[idx - 1]); + } + + idx = startIdx; + while (idx != insertion_idx) { + ROBIN_HOOD_COUNT(shiftUp) + mInfo[idx] = static_cast(mInfo[idx - 1] + mInfoInc); + if (ROBIN_HOOD_UNLIKELY(mInfo[idx] + mInfoInc > 0xFF)) { + mMaxNumElementsAllowed = 0; + } + --idx; + } + } + + void shiftDown(size_t idx) noexcept( + std::is_nothrow_move_assignable::value) { + // until we find one that is either empty or has zero offset. + // TODO(martinus) we don't need to move everything, just the last one for + // the same + // bucket. + mKeyVals[idx].destroy(*this); + + // until we find one that is either empty or has zero offset. + while (mInfo[idx + 1] >= 2 * mInfoInc) { + ROBIN_HOOD_COUNT(shiftDown) + mInfo[idx] = static_cast(mInfo[idx + 1] - mInfoInc); + mKeyVals[idx] = std::move(mKeyVals[idx + 1]); + ++idx; + } + + mInfo[idx] = 0; + // don't destroy, we've moved it + // mKeyVals[idx].destroy(*this); + mKeyVals[idx].~Node(); + } + + // copy of find(), except that it returns iterator instead of const_iterator. + template + ROBIN_HOOD(NODISCARD) + size_t findIdx(Other const &key) const { + size_t idx{}; + InfoType info{}; + keyToIdx(key, &idx, &info); + + do { + // unrolling this twice gives a bit of a speedup. More unrolling did not + // help. + if (info == mInfo[idx] && ROBIN_HOOD_LIKELY(WKeyEqual::operator()( + key, mKeyVals[idx].getFirst()))) { + return idx; + } + next(&info, &idx); + if (info == mInfo[idx] && ROBIN_HOOD_LIKELY(WKeyEqual::operator()( + key, mKeyVals[idx].getFirst()))) { + return idx; + } + next(&info, &idx); + } while (info <= mInfo[idx]); + + // nothing found! + return mMask == 0 + ? 0 + : static_cast(std::distance( + mKeyVals, + reinterpret_cast_no_cast_align_warning(mInfo))); + } + + void cloneData(const Table &o) { + Cloner()(o, *this); + } + + // inserts a keyval that is guaranteed to be new, e.g. when the hashmap is + // resized. + // @return True on success, false if something went wrong + void insert_move(Node &&keyval) { + // we don't retry, fail if overflowing + // don't need to check max num elements + if (0 == mMaxNumElementsAllowed && !try_increase_info()) { + throwOverflowError(); + } + + size_t idx{}; + InfoType info{}; + keyToIdx(keyval.getFirst(), &idx, &info); + + // skip forward. Use <= because we are certain that the element is not + // there. + while (info <= mInfo[idx]) { + idx = idx + 1; + info += mInfoInc; + } + + // key not found, so we are now exactly where we want to insert it. + auto const insertion_idx = idx; + auto const insertion_info = static_cast(info); + if (ROBIN_HOOD_UNLIKELY(insertion_info + mInfoInc > 0xFF)) { + mMaxNumElementsAllowed = 0; + } + + // find an empty spot + while (0 != mInfo[idx]) { + next(&info, &idx); + } + + auto &l = mKeyVals[insertion_idx]; + if (idx == insertion_idx) { + ::new (static_cast(&l)) Node(std::move(keyval)); + } else { + shiftUp(idx, insertion_idx); + l = std::move(keyval); + } + + // put at empty spot + mInfo[insertion_idx] = insertion_info; + + ++mNumElements; + } + + public: + using iterator = Iter; + using const_iterator = Iter; + + Table() noexcept(noexcept(Hash()) && noexcept(KeyEqual())) + : WHash(), WKeyEqual() { + ROBIN_HOOD_TRACE(this) + } + + // Creates an empty hash map. Nothing is allocated yet, this happens at the + // first insert. 
+ // This tremendously speeds up ctor & dtor of a map that never receives an + // element. The + // penalty is payed at the first insert, and not before. Lookup of this empty + // map works + // because everybody points to DummyInfoByte::b. parameter bucket_count is + // dictated by the + // standard, but we can ignore it. + explicit Table( + size_t ROBIN_HOOD_UNUSED(bucket_count) /*unused*/, const Hash &h = Hash{}, + const KeyEqual &equal = KeyEqual{}) noexcept(noexcept(Hash(h)) && + noexcept(KeyEqual(equal))) + : WHash(h), WKeyEqual(equal) { + ROBIN_HOOD_TRACE(this) + } + + template + Table(Iter first, Iter last, + size_t ROBIN_HOOD_UNUSED(bucket_count) /*unused*/ = 0, + const Hash &h = Hash{}, const KeyEqual &equal = KeyEqual{}) + : WHash(h), WKeyEqual(equal) { + ROBIN_HOOD_TRACE(this) + insert(first, last); + } + + Table(std::initializer_list initlist, + size_t ROBIN_HOOD_UNUSED(bucket_count) /*unused*/ = 0, + const Hash &h = Hash{}, const KeyEqual &equal = KeyEqual{}) + : WHash(h), WKeyEqual(equal) { + ROBIN_HOOD_TRACE(this) + insert(initlist.begin(), initlist.end()); + } + + Table(Table &&o) noexcept : WHash(std::move(static_cast(o))), + WKeyEqual(std::move(static_cast(o))), + DataPool(std::move(static_cast(o))) { + ROBIN_HOOD_TRACE(this) + if (o.mMask) { + mHashMultiplier = std::move(o.mHashMultiplier); + mKeyVals = std::move(o.mKeyVals); + mInfo = std::move(o.mInfo); + mNumElements = std::move(o.mNumElements); + mMask = std::move(o.mMask); + mMaxNumElementsAllowed = std::move(o.mMaxNumElementsAllowed); + mInfoInc = std::move(o.mInfoInc); + mInfoHashShift = std::move(o.mInfoHashShift); + // set other's mask to 0 so its destructor won't do anything + o.init(); + } + } + + Table &operator=(Table &&o) noexcept { + ROBIN_HOOD_TRACE(this) + if (&o != this) { + if (o.mMask) { + // only move stuff if the other map actually has some data + destroy(); + mHashMultiplier = std::move(o.mHashMultiplier); + mKeyVals = std::move(o.mKeyVals); + mInfo = std::move(o.mInfo); + mNumElements = std::move(o.mNumElements); + mMask = std::move(o.mMask); + mMaxNumElementsAllowed = std::move(o.mMaxNumElementsAllowed); + mInfoInc = std::move(o.mInfoInc); + mInfoHashShift = std::move(o.mInfoHashShift); + WHash::operator=(std::move(static_cast(o))); + WKeyEqual::operator=(std::move(static_cast(o))); + DataPool::operator=(std::move(static_cast(o))); + + o.init(); + + } else { + // nothing in the other map => just clear us. + clear(); + } + } + return *this; + } + + Table(const Table &o) + : WHash(static_cast(o)), + WKeyEqual(static_cast(o)), + DataPool(static_cast(o)) { + ROBIN_HOOD_TRACE(this) + if (!o.empty()) { + // not empty: create an exact copy. it is also possible to just iterate + // through all + // elements and insert them, but copying is probably faster. + + auto const numElementsWithBuffer = calcNumElementsWithBuffer(o.mMask + 1); + auto const numBytesTotal = calcNumBytesTotal(numElementsWithBuffer); + + ROBIN_HOOD_LOG("std::malloc " << numBytesTotal << " = calcNumBytesTotal(" + << numElementsWithBuffer << ")") + mHashMultiplier = o.mHashMultiplier; + mKeyVals = static_cast( + detail::assertNotNull(std::malloc(numBytesTotal))); + // no need for calloc because clonData does memcpy + mInfo = reinterpret_cast(mKeyVals + numElementsWithBuffer); + mNumElements = o.mNumElements; + mMask = o.mMask; + mMaxNumElementsAllowed = o.mMaxNumElementsAllowed; + mInfoInc = o.mInfoInc; + mInfoHashShift = o.mInfoHashShift; + cloneData(o); + } + } + + // Creates a copy of the given map. 
Copy constructor of each entry is used. + // Not sure why clang-tidy thinks this doesn't handle self assignment, it does + // NOLINTNEXTLINE(bugprone-unhandled-self-assignment,cert-oop54-cpp) + Table &operator=(Table const &o) { + ROBIN_HOOD_TRACE(this) + if (&o == this) { + // prevent assigning of itself + return *this; + } + + // we keep using the old allocator and not assign the new one, because we + // want to keep + // the memory available. when it is the same size. + if (o.empty()) { + if (0 == mMask) { + // nothing to do, we are empty too + return *this; + } + + // not empty: destroy what we have there + // clear also resets mInfo to 0, that's sometimes not necessary. + destroy(); + init(); + WHash::operator=(static_cast(o)); + WKeyEqual::operator=(static_cast(o)); + DataPool::operator=(static_cast(o)); + + return *this; + } + + // clean up old stuff + Destroyer::value>{} + .nodes(*this); + + if (mMask != o.mMask) { + // no luck: we don't have the same array size allocated, so we need to + // realloc. + if (0 != mMask) { + // only deallocate if we actually have data! + ROBIN_HOOD_LOG("std::free") + std::free(mKeyVals); + } + + auto const numElementsWithBuffer = calcNumElementsWithBuffer(o.mMask + 1); + auto const numBytesTotal = calcNumBytesTotal(numElementsWithBuffer); + ROBIN_HOOD_LOG("std::malloc " << numBytesTotal << " = calcNumBytesTotal(" + << numElementsWithBuffer << ")") + mKeyVals = static_cast( + detail::assertNotNull(std::malloc(numBytesTotal))); + + // no need for calloc here because cloneData performs a memcpy. + mInfo = reinterpret_cast(mKeyVals + numElementsWithBuffer); + // sentinel is set in cloneData + } + WHash::operator=(static_cast(o)); + WKeyEqual::operator=(static_cast(o)); + DataPool::operator=(static_cast(o)); + mHashMultiplier = o.mHashMultiplier; + mNumElements = o.mNumElements; + mMask = o.mMask; + mMaxNumElementsAllowed = o.mMaxNumElementsAllowed; + mInfoInc = o.mInfoInc; + mInfoHashShift = o.mInfoHashShift; + cloneData(o); + + return *this; + } + + // Swaps everything between the two maps. + void swap(Table &o) { + ROBIN_HOOD_TRACE(this) + using std::swap; + swap(o, *this); + } + + // Clears all data, without resizing. + void clear() { + ROBIN_HOOD_TRACE(this) + if (empty()) { + // don't do anything! also important because we don't want to write to + // DummyInfoByte::b, even though we would just write 0 to it. + return; + } + + Destroyer::value>{} + .nodes(*this); + + auto const numElementsWithBuffer = calcNumElementsWithBuffer(mMask + 1); + // clear everything, then set the sentinel again + uint8_t const z = 0; + std::fill(mInfo, mInfo + calcNumBytesInfo(numElementsWithBuffer), z); + mInfo[numElementsWithBuffer] = 1; + + mInfoInc = InitialInfoInc; + mInfoHashShift = InitialInfoHashShift; + } + + // Destroys the map and all it's contents. + ~Table() { + ROBIN_HOOD_TRACE(this) + destroy(); + } + + // Checks if both tables contain the same entries. Order is irrelevant. 
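findIdx() shown earlier walks the table with a running info value: probing continues while that value is <= the stored info byte, and a hit additionally requires key equality. A toy standalone restatement of that probe loop; all names are hypothetical:

#include <cstddef>
#include <cstdint>

// Toy version of the findIdx() probe. info_bytes[i] == 0 marks an empty slot,
// and the real table guarantees a sentinel at the end, so the loop terminates.
// Returns n when the key is absent.
std::size_t probe(const uint8_t *info_bytes, const int *keys, std::size_t n,
                  std::size_t start_idx, uint32_t start_info,
                  uint32_t info_inc, int wanted) {
  std::size_t idx = start_idx;
  uint32_t info = start_info;
  while (info <= info_bytes[idx]) {
    if (info == info_bytes[idx] && keys[idx] == wanted) {
      return idx;  // found
    }
    ++idx;             // step to the next slot ...
    info += info_inc;  // ... and grow the expected displacement
  }
  return n;  // hit an empty slot or an entry closer to its home bucket
}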
+ bool operator==(const Table &other) const { + ROBIN_HOOD_TRACE(this) + if (other.size() != size()) { + return false; + } + for (auto const &otherEntry : other) { + if (!has(otherEntry)) { + return false; + } + } + + return true; + } + + bool operator!=(const Table &other) const { + ROBIN_HOOD_TRACE(this) + return !operator==(other); + } + + template + typename std::enable_if::value, Q &>::type operator[]( + const key_type &key) { + ROBIN_HOOD_TRACE(this) + auto idxAndState = insertKeyPrepareEmptySpot(key); + switch (idxAndState.second) { + case InsertionState::key_found: + break; + + case InsertionState::new_node: + ::new (static_cast(&mKeyVals[idxAndState.first])) + Node(*this, std::piecewise_construct, std::forward_as_tuple(key), + std::forward_as_tuple()); + break; + + case InsertionState::overwrite_node: + mKeyVals[idxAndState.first] = + Node(*this, std::piecewise_construct, std::forward_as_tuple(key), + std::forward_as_tuple()); + break; + + case InsertionState::overflow_error: + throwOverflowError(); + } + + return mKeyVals[idxAndState.first].getSecond(); + } + + template + typename std::enable_if::value, Q &>::type operator[]( + key_type &&key) { + ROBIN_HOOD_TRACE(this) + auto idxAndState = insertKeyPrepareEmptySpot(key); + switch (idxAndState.second) { + case InsertionState::key_found: + break; + + case InsertionState::new_node: + ::new (static_cast(&mKeyVals[idxAndState.first])) Node( + *this, std::piecewise_construct, + std::forward_as_tuple(std::move(key)), std::forward_as_tuple()); + break; + + case InsertionState::overwrite_node: + mKeyVals[idxAndState.first] = Node( + *this, std::piecewise_construct, + std::forward_as_tuple(std::move(key)), std::forward_as_tuple()); + break; + + case InsertionState::overflow_error: + throwOverflowError(); + } + + return mKeyVals[idxAndState.first].getSecond(); + } + + template + void insert(Iter first, Iter last) { + for (; first != last; ++first) { + // value_type ctor needed because this might be called with std::pair's + insert(value_type(*first)); + } + } + + void insert(std::initializer_list ilist) { + for (auto &&vt : ilist) { + insert(std::move(vt)); + } + } + + template + std::pair emplace(Args &&... args) { + ROBIN_HOOD_TRACE(this) + Node n{*this, std::forward(args)...}; + auto idxAndState = insertKeyPrepareEmptySpot(getFirstConst(n)); + switch (idxAndState.second) { + case InsertionState::key_found: + n.destroy(*this); + break; + + case InsertionState::new_node: + ::new (static_cast(&mKeyVals[idxAndState.first])) + Node(*this, std::move(n)); + break; + + case InsertionState::overwrite_node: + mKeyVals[idxAndState.first] = std::move(n); + break; + + case InsertionState::overflow_error: + n.destroy(*this); + throwOverflowError(); + break; + } + + return std::make_pair( + iterator(mKeyVals + idxAndState.first, mInfo + idxAndState.first), + InsertionState::key_found != idxAndState.second); + } + + template + std::pair try_emplace(const key_type &key, Args &&... args) { + return try_emplace_impl(key, std::forward(args)...); + } + + template + std::pair try_emplace(key_type &&key, Args &&... args) { + return try_emplace_impl(std::move(key), std::forward(args)...); + } + + template + std::pair try_emplace(const_iterator hint, + const key_type &key, Args &&... args) { + (void)hint; + return try_emplace_impl(key, std::forward(args)...); + } + + template + std::pair try_emplace(const_iterator hint, key_type &&key, + Args &&... 
args) { + (void)hint; + return try_emplace_impl(std::move(key), std::forward(args)...); + } + + template + std::pair insert_or_assign(const key_type &key, + Mapped &&obj) { + return insertOrAssignImpl(key, std::forward(obj)); + } + + template + std::pair insert_or_assign(key_type &&key, Mapped &&obj) { + return insertOrAssignImpl(std::move(key), std::forward(obj)); + } + + template + std::pair insert_or_assign(const_iterator hint, + const key_type &key, + Mapped &&obj) { + (void)hint; + return insertOrAssignImpl(key, std::forward(obj)); + } + + template + std::pair insert_or_assign(const_iterator hint, + key_type &&key, Mapped &&obj) { + (void)hint; + return insertOrAssignImpl(std::move(key), std::forward(obj)); + } + + std::pair insert(const value_type &keyval) { + ROBIN_HOOD_TRACE(this) + return emplace(keyval); + } + + std::pair insert(value_type &&keyval) { + return emplace(std::move(keyval)); + } + + // Returns 1 if key is found, 0 otherwise. + size_t count(const key_type &key) const { // NOLINT(modernize-use-nodiscard) + ROBIN_HOOD_TRACE(this) + auto kv = mKeyVals + findIdx(key); + if (kv != reinterpret_cast_no_cast_align_warning(mInfo)) { + return 1; + } + return 0; + } + + template + // NOLINTNEXTLINE(modernize-use-nodiscard) + typename std::enable_if::type count( + const OtherKey &key) const { + ROBIN_HOOD_TRACE(this) + auto kv = mKeyVals + findIdx(key); + if (kv != reinterpret_cast_no_cast_align_warning(mInfo)) { + return 1; + } + return 0; + } + + bool contains(const key_type &key) const { // NOLINT(modernize-use-nodiscard) + return 1U == count(key); + } + + template + // NOLINTNEXTLINE(modernize-use-nodiscard) + typename std::enable_if::type contains( + const OtherKey &key) const { + return 1U == count(key); + } + + // Returns a reference to the value found for key. + // Throws std::out_of_range if element cannot be found + template + // NOLINTNEXTLINE(modernize-use-nodiscard) + typename std::enable_if::value, Q &>::type at( + key_type const &key) { + ROBIN_HOOD_TRACE(this) + auto kv = mKeyVals + findIdx(key); + if (kv == reinterpret_cast_no_cast_align_warning(mInfo)) { + doThrow("key not found"); + } + return kv->getSecond(); + } + + // Returns a reference to the value found for key. 
+ // Throws std::out_of_range if element cannot be found + template + // NOLINTNEXTLINE(modernize-use-nodiscard) + typename std::enable_if::value, Q const &>::type at( + key_type const &key) const { + ROBIN_HOOD_TRACE(this) + auto kv = mKeyVals + findIdx(key); + if (kv == reinterpret_cast_no_cast_align_warning(mInfo)) { + doThrow("key not found"); + } + return kv->getSecond(); + } + + const_iterator find( + const key_type &key) const { // NOLINT(modernize-use-nodiscard) + ROBIN_HOOD_TRACE(this) + const size_t idx = findIdx(key); + return const_iterator{mKeyVals + idx, mInfo + idx}; + } + + template + const_iterator find(const OtherKey &key, + is_transparent_tag /*unused*/) const { + ROBIN_HOOD_TRACE(this) + const size_t idx = findIdx(key); + return const_iterator{mKeyVals + idx, mInfo + idx}; + } + + template + typename std::enable_if< + Self_::is_transparent, // NOLINT(modernize-use-nodiscard) + const_iterator>::type // NOLINT(modernize-use-nodiscard) + find(const OtherKey &key) const { // NOLINT(modernize-use-nodiscard) + ROBIN_HOOD_TRACE(this) + const size_t idx = findIdx(key); + return const_iterator{mKeyVals + idx, mInfo + idx}; + } + + iterator find(const key_type &key) { + ROBIN_HOOD_TRACE(this) + const size_t idx = findIdx(key); + return iterator{mKeyVals + idx, mInfo + idx}; + } + + template + iterator find(const OtherKey &key, is_transparent_tag /*unused*/) { + ROBIN_HOOD_TRACE(this) + const size_t idx = findIdx(key); + return iterator{mKeyVals + idx, mInfo + idx}; + } + + template + typename std::enable_if::type find( + const OtherKey &key) { + ROBIN_HOOD_TRACE(this) + const size_t idx = findIdx(key); + return iterator{mKeyVals + idx, mInfo + idx}; + } + + iterator begin() { + ROBIN_HOOD_TRACE(this) + if (empty()) { + return end(); + } + return iterator(mKeyVals, mInfo, fast_forward_tag{}); + } + const_iterator begin() const { // NOLINT(modernize-use-nodiscard) + ROBIN_HOOD_TRACE(this) + return cbegin(); + } + const_iterator cbegin() const { // NOLINT(modernize-use-nodiscard) + ROBIN_HOOD_TRACE(this) + if (empty()) { + return cend(); + } + return const_iterator(mKeyVals, mInfo, fast_forward_tag{}); + } + + iterator end() { + ROBIN_HOOD_TRACE(this) + // no need to supply valid info pointer: end() must not be dereferenced, and + // only node + // pointer is compared. + return iterator{reinterpret_cast_no_cast_align_warning(mInfo), + nullptr}; + } + const_iterator end() const { // NOLINT(modernize-use-nodiscard) + ROBIN_HOOD_TRACE(this) + return cend(); + } + const_iterator cend() const { // NOLINT(modernize-use-nodiscard) + ROBIN_HOOD_TRACE(this) + return const_iterator{reinterpret_cast_no_cast_align_warning(mInfo), + nullptr}; + } + + iterator erase(const_iterator pos) { + ROBIN_HOOD_TRACE(this) + // its safe to perform const cast here + // NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast) + return erase(iterator{const_cast(pos.mKeyVals), + const_cast(pos.mInfo)}); + } + + // Erases element at pos, returns iterator to the next element. + iterator erase(iterator pos) { + ROBIN_HOOD_TRACE(this) + // we assume that pos always points to a valid entry, and not end(). 
+ auto const idx = static_cast(pos.mKeyVals - mKeyVals); + + shiftDown(idx); + --mNumElements; + + if (*pos.mInfo) { + // we've backward shifted, return this again + return pos; + } + + // no backward shift, return next element + return ++pos; + } + + size_t erase(const key_type &key) { + ROBIN_HOOD_TRACE(this) + size_t idx{}; + InfoType info{}; + keyToIdx(key, &idx, &info); + + // check while info matches with the source idx + do { + if (info == mInfo[idx] && + WKeyEqual::operator()(key, mKeyVals[idx].getFirst())) { + shiftDown(idx); + --mNumElements; + return 1; + } + next(&info, &idx); + } while (info <= mInfo[idx]); + + // nothing found to delete + return 0; + } + + // reserves space for the specified number of elements. Makes sure the old + // data fits. + // exactly the same as reserve(c). + void rehash(size_t c) { + // forces a reserve + reserve(c, true); + } + + // reserves space for the specified number of elements. Makes sure the old + // data fits. + // Exactly the same as rehash(c). Use rehash(0) to shrink to fit. + void reserve(size_t c) { + // reserve, but don't force rehash + reserve(c, false); + } + + // If possible reallocates the map to a smaller one. This frees the underlying + // table. + // Does not do anything if load_factor is too large for decreasing the table's + // size. + void compact() { + ROBIN_HOOD_TRACE(this) + auto newSize = InitialNumElements; + while (calcMaxNumElementsAllowed(newSize) < mNumElements && newSize != 0) { + newSize *= 2; + } + if (ROBIN_HOOD_UNLIKELY(newSize == 0)) { + throwOverflowError(); + } + + ROBIN_HOOD_LOG("newSize > mMask + 1: " << newSize << " > " << mMask + << " + 1") + + // only actually do anything when the new size is bigger than the old one. + // This prevents to + // continuously allocate for each reserve() call. + if (newSize < mMask + 1) { + rehashPowerOfTwo(newSize, true); + } + } + + size_type size() const noexcept { // NOLINT(modernize-use-nodiscard) + ROBIN_HOOD_TRACE(this) + return mNumElements; + } + + size_type max_size() const noexcept { // NOLINT(modernize-use-nodiscard) + ROBIN_HOOD_TRACE(this) + return static_cast(-1); + } + + ROBIN_HOOD(NODISCARD) bool empty() const noexcept { + ROBIN_HOOD_TRACE(this) + return 0 == mNumElements; + } + + float max_load_factor() const noexcept { // NOLINT(modernize-use-nodiscard) + ROBIN_HOOD_TRACE(this) + return MaxLoadFactor100 / 100.0F; + } + + // Average number of elements per bucket. Since we allow only 1 per bucket + float load_factor() const noexcept { // NOLINT(modernize-use-nodiscard) + ROBIN_HOOD_TRACE(this) + return static_cast(size()) / static_cast(mMask + 1); + } + + ROBIN_HOOD(NODISCARD) size_t mask() const noexcept { + ROBIN_HOOD_TRACE(this) + return mMask; + } + + ROBIN_HOOD(NODISCARD) + size_t calcMaxNumElementsAllowed(size_t maxElements) const noexcept { + if (ROBIN_HOOD_LIKELY(maxElements <= + (std::numeric_limits::max)() / 100)) { + return maxElements * MaxLoadFactor100 / 100; + } + + // we might be a bit inprecise, but since maxElements is quite large that + // doesn't matter + return (maxElements / 100) * MaxLoadFactor100; + } + + ROBIN_HOOD(NODISCARD) + size_t calcNumBytesInfo(size_t numElements) const noexcept { + // we add a uint64_t, which houses the sentinel (first byte) and padding so + // we can load + // 64bit types. 
+ return numElements + sizeof(uint64_t); + } + + ROBIN_HOOD(NODISCARD) + size_t calcNumElementsWithBuffer(size_t numElements) const noexcept { + auto maxNumElementsAllowed = calcMaxNumElementsAllowed(numElements); + return numElements + + (std::min)(maxNumElementsAllowed, (static_cast(0xFF))); + } + + // calculation only allowed for 2^n values + ROBIN_HOOD(NODISCARD) size_t calcNumBytesTotal(size_t numElements) const { +#if ROBIN_HOOD(BITNESS) == 64 + return numElements * sizeof(Node) + calcNumBytesInfo(numElements); +#else + // make sure we're doing 64bit operations, so we are at least safe against + // 32bit overflows. + auto const ne = static_cast(numElements); + auto const s = static_cast(sizeof(Node)); + auto const infos = static_cast(calcNumBytesInfo(numElements)); + + auto const total64 = ne * s + infos; + auto const total = static_cast(total64); + + if (ROBIN_HOOD_UNLIKELY(static_cast(total) != total64)) { + throwOverflowError(); + } + return total; +#endif + } + + private: + template + ROBIN_HOOD(NODISCARD) + typename std::enable_if::value, bool>::type + has(const value_type &e) const { + ROBIN_HOOD_TRACE(this) + auto it = find(e.first); + return it != end() && it->second == e.second; + } + + template + ROBIN_HOOD(NODISCARD) + typename std::enable_if::value, bool>::type + has(const value_type &e) const { + ROBIN_HOOD_TRACE(this) + return find(e) != end(); + } + + void reserve(size_t c, bool forceRehash) { + ROBIN_HOOD_TRACE(this) + auto const minElementsAllowed = (std::max)(c, mNumElements); + auto newSize = InitialNumElements; + while (calcMaxNumElementsAllowed(newSize) < minElementsAllowed && + newSize != 0) { + newSize *= 2; + } + if (ROBIN_HOOD_UNLIKELY(newSize == 0)) { + throwOverflowError(); + } + + ROBIN_HOOD_LOG("newSize > mMask + 1: " << newSize << " > " << mMask + << " + 1") + + // only actually do anything when the new size is bigger than the old one. + // This prevents to + // continuously allocate for each reserve() call. + if (forceRehash || newSize > mMask + 1) { + rehashPowerOfTwo(newSize, false); + } + } + + // reserves space for at least the specified number of elements. + // only works if numBuckets if power of two + // True on success, false otherwise + void rehashPowerOfTwo(size_t numBuckets, bool forceFree) { + ROBIN_HOOD_TRACE(this) + + Node *const oldKeyVals = mKeyVals; + uint8_t const *const oldInfo = mInfo; + + const size_t oldMaxElementsWithBuffer = + calcNumElementsWithBuffer(mMask + 1); + + // resize operation: move stuff + initData(numBuckets); + if (oldMaxElementsWithBuffer > 1) { + for (size_t i = 0; i < oldMaxElementsWithBuffer; ++i) { + if (oldInfo[i] != 0) { + // might throw an exception, which is really bad since we are in the + // middle of + // moving stuff. + insert_move(std::move(oldKeyVals[i])); + // destroy the node but DON'T destroy the data. + oldKeyVals[i].~Node(); + } + } + + // this check is not necessary as it's guarded by the previous if, but it + // helps + // silence g++'s overeager "attempt to free a non-heap object 'map' + // [-Werror=free-nonheap-object]" warning. 
+ if (oldKeyVals != + reinterpret_cast_no_cast_align_warning(&mMask)) { + // don't destroy old data: put it into the pool instead + if (forceFree) { + std::free(oldKeyVals); + } else { + DataPool::addOrFree(oldKeyVals, + calcNumBytesTotal(oldMaxElementsWithBuffer)); + } + } + } + } + + ROBIN_HOOD(NOINLINE) void throwOverflowError() const { +#if ROBIN_HOOD(HAS_EXCEPTIONS) + throw std::overflow_error("robin_hood::map overflow"); +#else + abort(); +#endif + } + + template + std::pair try_emplace_impl(OtherKey &&key, Args &&... args) { + ROBIN_HOOD_TRACE(this) + auto idxAndState = insertKeyPrepareEmptySpot(key); + switch (idxAndState.second) { + case InsertionState::key_found: + break; + + case InsertionState::new_node: + ::new (static_cast(&mKeyVals[idxAndState.first])) + Node(*this, std::piecewise_construct, + std::forward_as_tuple(std::forward(key)), + std::forward_as_tuple(std::forward(args)...)); + break; + + case InsertionState::overwrite_node: + mKeyVals[idxAndState.first] = + Node(*this, std::piecewise_construct, + std::forward_as_tuple(std::forward(key)), + std::forward_as_tuple(std::forward(args)...)); + break; + + case InsertionState::overflow_error: + throwOverflowError(); + break; + } + + return std::make_pair( + iterator(mKeyVals + idxAndState.first, mInfo + idxAndState.first), + InsertionState::key_found != idxAndState.second); + } + + template + std::pair insertOrAssignImpl(OtherKey &&key, Mapped &&obj) { + ROBIN_HOOD_TRACE(this) + auto idxAndState = insertKeyPrepareEmptySpot(key); + switch (idxAndState.second) { + case InsertionState::key_found: + mKeyVals[idxAndState.first].getSecond() = std::forward(obj); + break; + + case InsertionState::new_node: + ::new (static_cast(&mKeyVals[idxAndState.first])) + Node(*this, std::piecewise_construct, + std::forward_as_tuple(std::forward(key)), + std::forward_as_tuple(std::forward(obj))); + break; + + case InsertionState::overwrite_node: + mKeyVals[idxAndState.first] = + Node(*this, std::piecewise_construct, + std::forward_as_tuple(std::forward(key)), + std::forward_as_tuple(std::forward(obj))); + break; + + case InsertionState::overflow_error: + throwOverflowError(); + break; + } + + return std::make_pair( + iterator(mKeyVals + idxAndState.first, mInfo + idxAndState.first), + InsertionState::key_found != idxAndState.second); + } + + void initData(size_t max_elements) { + mNumElements = 0; + mMask = max_elements - 1; + mMaxNumElementsAllowed = calcMaxNumElementsAllowed(max_elements); + + auto const numElementsWithBuffer = calcNumElementsWithBuffer(max_elements); + + // calloc also zeroes everything + auto const numBytesTotal = calcNumBytesTotal(numElementsWithBuffer); + ROBIN_HOOD_LOG("std::calloc " << numBytesTotal << " = calcNumBytesTotal(" + << numElementsWithBuffer << ")") + mKeyVals = reinterpret_cast( + detail::assertNotNull(std::calloc(1, numBytesTotal))); + mInfo = reinterpret_cast(mKeyVals + numElementsWithBuffer); + + // set sentinel + mInfo[numElementsWithBuffer] = 1; + + mInfoInc = InitialInfoInc; + mInfoHashShift = InitialInfoHashShift; + } + + enum class InsertionState { + overflow_error, + key_found, + new_node, + overwrite_node + }; + + // Finds key, and if not already present prepares a spot where to pot the key + // & value. + // This potentially shifts nodes out of the way, updates mInfo and number of + // inserted + // elements, so the only operation left to do is create/assign a new node at + // that spot. 
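reserve() and compact() above both search for the smallest power-of-two table size whose allowed load, calcMaxNumElementsAllowed() = size * MaxLoadFactor100 / 100, covers the required element count, and treat a resulting size of 0 as overflow. A small standalone sketch of that computation; the function name is hypothetical and 80 stands in for the default MaxLoadFactor100:

#include <cstddef>
#include <cstdint>
#include <cstdio>

// Hypothetical restatement of the growth rule used by reserve()/compact():
// smallest power-of-two bucket count whose allowed load covers `wanted`.
std::size_t grow_size(std::size_t wanted, std::size_t max_load_factor_100) {
  std::size_t n = sizeof(uint64_t);  // InitialNumElements == 8
  while (n != 0 && n * max_load_factor_100 / 100 < wanted) {
    n *= 2;  // wraps to 0 on overflow, which signals throwOverflowError()
  }
  return n;
}

int main() {
  // 1024 * 0.8 = 819 < 1000, 2048 * 0.8 = 1638 >= 1000 -> grow to 2048 buckets.
  std::printf("%zu\n", grow_size(1000, 80));
  return 0;
}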
+ template + std::pair insertKeyPrepareEmptySpot(OtherKey &&key) { + for (int i = 0; i < 256; ++i) { + size_t idx{}; + InfoType info{}; + keyToIdx(key, &idx, &info); + nextWhileLess(&info, &idx); + + // while we potentially have a match + while (info == mInfo[idx]) { + if (WKeyEqual::operator()(key, mKeyVals[idx].getFirst())) { + // key already exists, do NOT insert. + // see http://en.cppreference.com/w/cpp/container/unordered_map/insert + return std::make_pair(idx, InsertionState::key_found); + } + next(&info, &idx); + } + + // unlikely that this evaluates to true + if (ROBIN_HOOD_UNLIKELY(mNumElements >= mMaxNumElementsAllowed)) { + if (!increase_size()) { + return std::make_pair(size_t(0), InsertionState::overflow_error); + } + continue; + } + + // key not found, so we are now exactly where we want to insert it. + auto const insertion_idx = idx; + auto const insertion_info = info; + if (ROBIN_HOOD_UNLIKELY(insertion_info + mInfoInc > 0xFF)) { + mMaxNumElementsAllowed = 0; + } + + // find an empty spot + while (0 != mInfo[idx]) { + next(&info, &idx); + } + + if (idx != insertion_idx) { + shiftUp(idx, insertion_idx); + } + // put at empty spot + mInfo[insertion_idx] = static_cast(insertion_info); + ++mNumElements; + return std::make_pair( + insertion_idx, idx == insertion_idx ? InsertionState::new_node + : InsertionState::overwrite_node); + } + + // enough attempts failed, so finally give up. + return std::make_pair(size_t(0), InsertionState::overflow_error); + } + + bool try_increase_info() { + ROBIN_HOOD_LOG("mInfoInc=" << mInfoInc << ", numElements=" << mNumElements + << ", maxNumElementsAllowed=" + << calcMaxNumElementsAllowed(mMask + 1)) + if (mInfoInc <= 2) { + // need to be > 2 so that shift works (otherwise undefined behavior!) + return false; + } + // we got space left, try to make info smaller + mInfoInc = static_cast(mInfoInc >> 1U); + + // remove one bit of the hash, leaving more space for the distance info. + // This is extremely fast because we can operate on 8 bytes at once. + ++mInfoHashShift; + auto const numElementsWithBuffer = calcNumElementsWithBuffer(mMask + 1); + + for (size_t i = 0; i < numElementsWithBuffer; i += 8) { + auto val = unaligned_load(mInfo + i); + val = (val >> 1U) & UINT64_C(0x7f7f7f7f7f7f7f7f); + std::memcpy(mInfo + i, &val, sizeof(val)); + } + // update sentinel, which might have been cleared out! + mInfo[numElementsWithBuffer] = 1; + + mMaxNumElementsAllowed = calcMaxNumElementsAllowed(mMask + 1); + return true; + } + + // True if resize was possible, false otherwise + bool increase_size() { + // nothing allocated yet? just allocate InitialNumElements + if (0 == mMask) { + initData(InitialNumElements); + return true; + } + + auto const maxNumElementsAllowed = calcMaxNumElementsAllowed(mMask + 1); + if (mNumElements < maxNumElementsAllowed && try_increase_info()) { + return true; + } + + ROBIN_HOOD_LOG("mNumElements=" + << mNumElements + << ", maxNumElementsAllowed=" << maxNumElementsAllowed + << ", load=" << (static_cast(mNumElements) * 100.0 / + (static_cast(mMask) + 1))) + + nextHashMultiplier(); + if (mNumElements * 2 < calcMaxNumElementsAllowed(mMask + 1)) { + // we have to resize, even though there would still be plenty of space + // left! + // Try to rehash instead. Delete freed memory so we don't steadyily + // increase mem in case + // we have to rehash a few times + rehashPowerOfTwo(mMask + 1, true); + } else { + // Each resize use a different hash so we don't so easily overflow. 
+ // Make sure we only have odd numbers, so that the multiplication is + // reversible! + rehashPowerOfTwo((mMask + 1) * 2, false); + } + return true; + } + + void nextHashMultiplier() { + // adding an *even* number, so that the multiplier will always stay odd. + // This is necessary + // so that the hash stays a mixing function (and thus doesn't have any + // information loss). + mHashMultiplier += UINT64_C(0xc4ceb9fe1a85ec54); + } + + void destroy() { + if (0 == mMask) { + // don't deallocate! + return; + } + + Destroyer::value>{} + .nodesDoNotDeallocate(*this); + + // This protection against not deleting mMask shouldn't be needed as it's + // sufficiently + // protected with the 0==mMask check, but I have this anyways because g++ 7 + // otherwise + // reports a compile error: attempt to free a non-heap object 'fm' + // [-Werror=free-nonheap-object] + if (mKeyVals != reinterpret_cast_no_cast_align_warning(&mMask)) { + ROBIN_HOOD_LOG("std::free") + std::free(mKeyVals); + } + } + + void init() noexcept { + mKeyVals = reinterpret_cast_no_cast_align_warning(&mMask); + mInfo = reinterpret_cast(&mMask); + mNumElements = 0; + mMask = 0; + mMaxNumElementsAllowed = 0; + mInfoInc = InitialInfoInc; + mInfoHashShift = InitialInfoHashShift; + } + + // members are sorted so no padding occurs + uint64_t mHashMultiplier = UINT64_C(0xc4ceb9fe1a85ec53); // 8 byte 8 + Node *mKeyVals = + reinterpret_cast_no_cast_align_warning(&mMask); // 8 byte 16 + uint8_t *mInfo = reinterpret_cast(&mMask); // 8 byte 24 + size_t mNumElements = 0; // 8 byte 32 + size_t mMask = 0; // 8 byte 40 + size_t mMaxNumElementsAllowed = 0; // 8 byte 48 + InfoType mInfoInc = InitialInfoInc; // 4 byte 52 + InfoType mInfoHashShift = InitialInfoHashShift; // 4 byte 56 + // 16 byte 56 if NodeAllocator +}; + +} // namespace detail + +// map + +template , + typename KeyEqual = std::equal_to, size_t MaxLoadFactor100 = 80> +using unordered_flat_map = + detail::Table; + +template , + typename KeyEqual = std::equal_to, size_t MaxLoadFactor100 = 80> +using unordered_node_map = + detail::Table; + +template , + typename KeyEqual = std::equal_to, size_t MaxLoadFactor100 = 80> +using unordered_map = detail::Table< + sizeof(robin_hood::pair) <= sizeof(size_t) * 6 && + std::is_nothrow_move_constructible>::value && + std::is_nothrow_move_assignable>::value, + MaxLoadFactor100, Key, T, Hash, KeyEqual>; + +// set + +template , + typename KeyEqual = std::equal_to, size_t MaxLoadFactor100 = 80> +using unordered_flat_set = + detail::Table; + +template , + typename KeyEqual = std::equal_to, size_t MaxLoadFactor100 = 80> +using unordered_node_set = + detail::Table; + +template , + typename KeyEqual = std::equal_to, size_t MaxLoadFactor100 = 80> +using unordered_set = + detail::Table::value && + std::is_nothrow_move_assignable::value, + MaxLoadFactor100, Key, void, Hash, KeyEqual>; + +} // namespace robin_hood + +#endif diff --git a/paddle/fluid/extension/include/ext_tensor.h b/paddle/fluid/extension/include/ext_tensor.h index 52606b2a7f59e..fa91490e6cd8a 100644 --- a/paddle/fluid/extension/include/ext_tensor.h +++ b/paddle/fluid/extension/include/ext_tensor.h @@ -113,6 +113,9 @@ class PD_DLL_DECL Tensor { /// \brief Cast datatype from one to another Tensor cast(const DataType& target_type) const; + /// \brief Check Tensor is initialized + bool is_initialized() const; + #ifdef PADDLE_WITH_CUDA /// \bref Get current stream of Tensor cudaStream_t stream() const; diff --git a/paddle/fluid/extension/src/ext_tensor.cc b/paddle/fluid/extension/src/ext_tensor.cc index 
e9705e2101cc3..8b2f7cc5bf13c 100644 --- a/paddle/fluid/extension/src/ext_tensor.cc +++ b/paddle/fluid/extension/src/ext_tensor.cc @@ -103,15 +103,6 @@ void GpuCopy(T *src, T *dst, PlaceType src_plc, PlaceType dst_plc, void Tensor::reshape(const std::vector &shape) { GET_CASTED_TENSOR auto new_dim = framework::make_ddim(shape); - if (tensor->numel() != framework::product(new_dim)) { - LOG(WARNING) << "Custom Op: Calling reshape to a new shape which is bigger " - "or smaller" - << "than original shape will not change your tensor's memory " - "Please call" - << "paddle::Tensor::mutable_data() after to reallocate " - "your tensor's size." - << std::endl; - } tensor->Resize(new_dim); } @@ -393,6 +384,15 @@ int64_t Tensor::size() const { return tensor->numel(); } +bool Tensor::is_initialized() const { + GET_CASTED_TENSOR; + if (tensor->IsInitialized()) { + return true; + } else { + return false; + } +} + #ifdef PADDLE_WITH_CUDA cudaStream_t Tensor::stream() const { if (!stream_.IsStreamSet()) { diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index 13c37b93d7c98..24bed27728083 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -191,13 +191,15 @@ if(WITH_PYTHON) py_proto_compile(distributed_strategy_py_proto SRCS distributed_strategy.proto) #Generate an empty \ #__init__.py to make framework_py_proto as a valid python module. + add_custom_target(fleet_proto_init ALL + COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/distributed/fleet/proto + COMMAND ${CMAKE_COMMAND} -E touch ${PADDLE_BINARY_DIR}/python/paddle/distributed/fleet/proto/__init__.py + ) add_custom_target(framework_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch __init__.py) - add_dependencies(framework_py_proto framework_py_proto_init trainer_py_proto distributed_strategy_py_proto) + add_dependencies(framework_py_proto framework_py_proto_init trainer_py_proto distributed_strategy_py_proto fleet_proto_init) if (NOT WIN32) add_custom_command(TARGET framework_py_proto POST_BUILD COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto - COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/distributed/fleet/proto - COMMAND ${CMAKE_COMMAND} -E touch ${PADDLE_BINARY_DIR}/python/paddle/distributed/fleet/proto/__init__.py COMMAND cp *.py ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/ COMMAND cp distributed_strategy_*.py ${PADDLE_BINARY_DIR}/python/paddle/distributed/fleet/proto COMMENT "Copy generated python proto into directory paddle/fluid/proto." @@ -207,8 +209,6 @@ if(WITH_PYTHON) string(REPLACE "/" "\\" fleet_proto_dstpath "${PADDLE_BINARY_DIR}/python/paddle/distributed/fleet/proto/") add_custom_command(TARGET framework_py_proto POST_BUILD COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto - COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/distributed/fleet/proto - COMMAND ${CMAKE_COMMAND} -E touch ${PADDLE_BINARY_DIR}/python/paddle/distributed/fleet/proto/__init__.py COMMAND copy /Y *.py ${proto_dstpath} COMMAND copy /Y distributed_strategy_*.py ${fleet_proto_dstpath} COMMENT "Copy generated python proto into directory paddle/fluid/proto." 
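The Tensor::is_initialized() implementation added to ext_tensor.cc earlier in this diff, together with the declaration in ext_tensor.h, lets custom operators check whether a paddle::Tensor already owns memory before touching its data. A minimal usage sketch, assuming the extension header path from this diff; the helper function itself is hypothetical and not part of the patch:

#include "paddle/fluid/extension/include/ext_tensor.h"

// Hypothetical helper: read the first element only when the tensor has been
// allocated (i.e. mutable_data() has already been called on it).
float FirstElementOrZero(const paddle::Tensor &t) {
  if (!t.is_initialized()) {
    return 0.0f;
  }
  return t.data<float>()[0];
}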
@@ -217,6 +217,12 @@ if(WITH_PYTHON) endif(NOT WIN32) endif() +if (WITH_PSCORE) + add_custom_target(index_dataset_proto_init ALL DEPENDS fleet_proto_init index_dataset_py_proto + COMMAND cp ${PADDLE_BINARY_DIR}/paddle/fluid/distributed/index_dataset/index_dataset_*.py ${PADDLE_BINARY_DIR}/python/paddle/distributed/fleet/proto + COMMENT "Copy generated python proto into directory paddle/distributed/fleet/proto.") +endif(WITH_PSCORE) + cc_library(lod_rank_table SRCS lod_rank_table.cc DEPS lod_tensor) cc_library(feed_fetch_method SRCS feed_fetch_method.cc DEPS lod_tensor scope glog) diff --git a/paddle/fluid/framework/custom_tensor_test.cc b/paddle/fluid/framework/custom_tensor_test.cc index 8d6fd4efd5ae3..a65dcbd55f946 100644 --- a/paddle/fluid/framework/custom_tensor_test.cc +++ b/paddle/fluid/framework/custom_tensor_test.cc @@ -220,6 +220,21 @@ void GroupTestDtypeConvert() { paddle::DataType::FLOAT16); } +void TestInitilized() { + paddle::Tensor test_tensor(paddle::PlaceType::kCPU); + CHECK(test_tensor.is_initialized() == false); + test_tensor.reshape({1, 1}); + test_tensor.mutable_data(); + CHECK(test_tensor.is_initialized() == true); + float* tensor_data = test_tensor.data(); + for (int i = 0; i < test_tensor.size(); i++) { + tensor_data[i] = 0.5; + } + for (int i = 0; i < test_tensor.size(); i++) { + CHECK(tensor_data[i] == 0.5); + } +} + TEST(CustomTensor, copyTest) { VLOG(2) << "TestCopy"; GroupTestCopy(); @@ -233,4 +248,6 @@ TEST(CustomTensor, copyTest) { GroupTestCast(); VLOG(2) << "TestDtypeConvert"; GroupTestDtypeConvert(); + VLOG(2) << "TestInitilized"; + TestInitilized(); } diff --git a/paddle/fluid/framework/details/nan_inf_utils_detail.cc b/paddle/fluid/framework/details/nan_inf_utils_detail.cc index 103dd0c5ae599..0fdb97db20af9 100644 --- a/paddle/fluid/framework/details/nan_inf_utils_detail.cc +++ b/paddle/fluid/framework/details/nan_inf_utils_detail.cc @@ -354,8 +354,36 @@ void CheckVarHasNanOrInf(const std::string& op_type, var_name)); #endif return; - } + } else if (platform::is_npu_place(tensor->place())) { +#ifdef PADDLE_WITH_ASCEND_CL + if (tensor->type() != proto::VarType::FP32) { + return; + } + + framework::LoDTensor cpu_tensor; + cpu_tensor.Resize(tensor->dims()); + float* cpu_data = static_cast( + cpu_tensor.mutable_data(platform::CPUPlace(), tensor->type())); + framework::TensorCopySync(*tensor, platform::CPUPlace(), &cpu_tensor); + bool flag = false; + for (int i = 0; i < cpu_tensor.numel(); i++) { + if (isnan(cpu_data[i]) || isinf(cpu_data[i])) { + flag = true; + break; + } + } + PADDLE_ENFORCE_NE( + flag, true, + platform::errors::Fatal("Operator %s output Tensor %s contains Inf.", + op_type, var_name)); +#else + PADDLE_THROW(platform::errors::PreconditionNotMet( + "Tensor[%s] use npu place. 
PaddlePaddle must compile with NPU.", + var_name)); +#endif + return; + } tensor_check(op_type, var_name, *tensor, place); } diff --git a/paddle/fluid/framework/device_worker.h b/paddle/fluid/framework/device_worker.h index 9ced4221e1dd6..a49e492e48028 100644 --- a/paddle/fluid/framework/device_worker.h +++ b/paddle/fluid/framework/device_worker.h @@ -562,7 +562,6 @@ class PSGPUWorker : public HogwildWorker { void ResetStat(); protected: - std::shared_ptr fleet_ptr_; void PushGradients(); void DumpParam(); void CopySparseTable(); @@ -639,7 +638,8 @@ class PSGPUWorker : public HogwildWorker { }; #endif -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ + defined(WITH_ASCEND_CL) class SectionWorker : public DeviceWorker { public: SectionWorker() {} diff --git a/paddle/fluid/framework/device_worker_factory.cc b/paddle/fluid/framework/device_worker_factory.cc index a539a5d5f96b5..5780a95343385 100644 --- a/paddle/fluid/framework/device_worker_factory.cc +++ b/paddle/fluid/framework/device_worker_factory.cc @@ -79,7 +79,8 @@ REGISTER_DEVICE_WORKER_CLASS(HeterBoxWorker); REGISTER_DEVICE_WORKER_CLASS(PSGPUWorker); #endif -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ + defined(WITH_ASCEND_CL) REGISTER_DEVICE_WORKER_CLASS(SectionWorker); #endif } // namespace framework diff --git a/paddle/fluid/framework/distributed_strategy.proto b/paddle/fluid/framework/distributed_strategy.proto old mode 100755 new mode 100644 index 6363eedc80a20..654b88920acaf --- a/paddle/fluid/framework/distributed_strategy.proto +++ b/paddle/fluid/framework/distributed_strategy.proto @@ -43,6 +43,12 @@ message ShardingConfig { optional int32 pp_degree = 11 [ default = 1 ]; } +message HybridConfig { + optional int32 dp_degree = 1 [ default = -1 ]; + optional int32 mp_degree = 2 [ default = 1 ]; + optional int32 pp_degree = 3 [ default = 1 ]; +} + message AMPConfig { optional float init_loss_scaling = 1 [ default = 32768.0 ]; optional int32 incr_every_n_steps = 2 [ default = 1000 ]; @@ -124,6 +130,7 @@ message AsyncConfig { optional bool launch_barrier = 9 [ default = true ]; optional string heter_worker_device_guard = 10 [ default = 'cpu' ]; optional int32 lr_decay_steps = 11 [ default = 10 ]; + optional int32 use_ps_gpu = 12 [ default = 0 ]; } message PipelineConfig { @@ -132,6 +139,10 @@ message PipelineConfig { optional string schedule_mode = 3 [ default = '1F1B' ]; } +message TensorParallelConfig { + optional int32 tensor_parallel_degree = 1 [ default = 1 ]; +} + message DistributedStrategy { // bool options optional Mode mode = 1 [ default = COLLECTIVE ]; @@ -162,6 +173,7 @@ message DistributedStrategy { optional bool sharding = 26 [ default = false ]; optional float last_comm_group_size_MB = 27 [ default = 1 ]; optional bool find_unused_parameters = 28 [ default = true ]; + optional bool tensor_parallel = 29 [ default = false ]; optional RecomputeConfig recompute_configs = 101; optional AMPConfig amp_configs = 102; @@ -174,6 +186,8 @@ message DistributedStrategy { optional LambConfig lamb_configs = 109; optional AdaptiveLocalSGDConfig adaptive_localsgd_configs = 110; optional ShardingConfig sharding_configs = 111; + optional HybridConfig hybrid_configs = 112; + optional TensorParallelConfig tensor_parallel_configs = 113; optional BuildStrategy build_strategy = 201; optional ExecutionStrategy execution_strategy = 202; } diff --git a/paddle/fluid/framework/executor.cc 
b/paddle/fluid/framework/executor.cc index 101991d2c1ba0..e5bfbf4a8f779 100644 --- a/paddle/fluid/framework/executor.cc +++ b/paddle/fluid/framework/executor.cc @@ -456,11 +456,22 @@ void Executor::RunPartialPreparedContext(ExecutorPrepareContext* ctx, #endif } else if (platform::is_npu_place(place_)) { #ifdef PADDLE_WITH_ASCEND_CL - // TODO(ascendrc): Support garbage collector on NPUPlace - VLOG(4) << "Skip NPU gc because it is not implemented now."; + if (IsFastEagerDeletionModeEnabled()) { + VLOG(4) << "Use unsafe fast gc for NPU."; + gc.reset(new NPUUnsafeFastGarbageCollector( + BOOST_GET_CONST(platform::NPUPlace, place_), max_memory_size)); + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "Please set FLAGS_fast_eager_deletion_mode=true to use " + "GarbageCollector on NPU.")); + // TODO(zhiqiu): fix bugs and enable NPUDefaultStreamGarbageCollector. + VLOG(4) << "Use default stream gc for NPU."; + gc.reset(new NPUDefaultStreamGarbageCollector( + BOOST_GET_CONST(platform::NPUPlace, place_), max_memory_size)); + } #else - PADDLE_THROW(platform::errors::Unimplemented( - "No NPU gc found in CPU/GPU/XPU paddle")); + PADDLE_THROW( + platform::errors::Unimplemented("No NPU gc found in CPU/NPU paddle")); #endif } } diff --git a/paddle/fluid/framework/fleet/CMakeLists.txt b/paddle/fluid/framework/fleet/CMakeLists.txt index ce0a905afc628..03dd2cff655c0 100644 --- a/paddle/fluid/framework/fleet/CMakeLists.txt +++ b/paddle/fluid/framework/fleet/CMakeLists.txt @@ -1,5 +1,10 @@ if(WITH_PSLIB) cc_library(fleet_wrapper SRCS fleet_wrapper.cc DEPS framework_proto variable_helper scope pslib_brpc pslib) +else() + cc_library(fleet_wrapper SRCS fleet_wrapper.cc DEPS framework_proto variable_helper scope) +endif(WITH_PSLIB) + +if(WITH_HETERPS) if(WITH_NCCL) nv_library(ps_gpu_wrapper SRCS ps_gpu_wrapper.cu ps_gpu_wrapper.cc DEPS heter_ps) @@ -8,13 +13,10 @@ if(WITH_PSLIB) hip_library(ps_gpu_wrapper SRCS ps_gpu_wrapper.cu ps_gpu_wrapper.cc DEPS heter_ps) add_subdirectory(heter_ps) - else() - cc_library(ps_gpu_wrapper SRCS ps_gpu_wrapper.cc) endif(WITH_NCCL) else() - cc_library(fleet_wrapper SRCS fleet_wrapper.cc DEPS framework_proto variable_helper scope) cc_library(ps_gpu_wrapper SRCS ps_gpu_wrapper.cc) -endif(WITH_PSLIB) +endif(WITH_HETERPS) if(WITH_NCCL OR WITH_RCCL) cc_library(nccl_wrapper SRCS nccl_wrapper.cc DEPS framework_proto variable_helper scope) @@ -41,6 +43,6 @@ cc_library(heter_wrapper SRCS heter_wrapper.cc DEPS framework_proto device_conte cc_test(test_fleet_cc SRCS test_fleet.cc DEPS fleet_wrapper gloo_wrapper fs shell) -if(WITH_ASCEND) +if(WITH_ASCEND OR WITH_ASCEND_CL) cc_library(ascend_wrapper SRCS ascend_wrapper.cc DEPS framework_proto lod_tensor ascend_ge ascend_graph) -endif(WITH_ASCEND) +endif() diff --git a/paddle/fluid/framework/fleet/ascend_wrapper.cc b/paddle/fluid/framework/fleet/ascend_wrapper.cc index d1b2f51f70036..273939f6bee61 100644 --- a/paddle/fluid/framework/fleet/ascend_wrapper.cc +++ b/paddle/fluid/framework/fleet/ascend_wrapper.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#ifdef PADDLE_WITH_ASCEND +#ifdef PADDLE_WITH_ASCEND_CL #include "paddle/fluid/framework/fleet/ascend_wrapper.h" namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/fleet/ascend_wrapper.h b/paddle/fluid/framework/fleet/ascend_wrapper.h index baa2fd126a4b7..f749ee8cfa0ba 100644 --- a/paddle/fluid/framework/fleet/ascend_wrapper.h +++ b/paddle/fluid/framework/fleet/ascend_wrapper.h @@ -14,7 +14,7 @@ limitations under the License. */ #pragma once -#ifdef PADDLE_WITH_ASCEND +#ifdef PADDLE_WITH_ASCEND_CL #include #include @@ -29,7 +29,6 @@ limitations under the License. */ #include "paddle/fluid/platform/timer.h" #include "ge/ge_api.h" -#include "ge/ge_api_types.h" #include "graph/attr_value.h" #include "graph/tensor.h" #include "graph/types.h" diff --git a/paddle/fluid/framework/fleet/fleet_wrapper.h b/paddle/fluid/framework/fleet/fleet_wrapper.h index e584fb5e2b9ca..613b2803637d2 100644 --- a/paddle/fluid/framework/fleet/fleet_wrapper.h +++ b/paddle/fluid/framework/fleet/fleet_wrapper.h @@ -34,6 +34,9 @@ limitations under the License. */ #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/variable_helper.h" #include "paddle/fluid/platform/macros.h" // for DISABLE_COPY_AND_ASSIGN +#ifdef PADDLE_WITH_HETERPS +#include "paddle/fluid/platform/type_defs.h" +#endif namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/fleet/heter_context.h b/paddle/fluid/framework/fleet/heter_context.h index a02931b3f5c28..1fb2f0fab4aff 100644 --- a/paddle/fluid/framework/fleet/heter_context.h +++ b/paddle/fluid/framework/fleet/heter_context.h @@ -14,15 +14,21 @@ limitations under the License. */ #pragma once -#if (defined PADDLE_WITH_NCCL || defined PADDLE_WITH_RCCL) && \ - (defined PADDLE_WITH_PSLIB) +#ifdef PADDLE_WITH_HETERPS #include #include #include #include +#ifdef PADDLE_WITH_PSLIB #include "common_value.h" // NOLINT +#endif + +#ifdef PADDLE_WITH_PSCORE +#include "paddle/fluid/distributed/table/depends/large_scale_kv.h" +#endif + #include "paddle/fluid/framework/fleet/heter_ps/feature_value.h" #include "paddle/fluid/framework/scope.h" @@ -39,7 +45,12 @@ class HeterContext { } Scope* scope_{nullptr}; std::vector> feature_keys_; +#ifdef PADDLE_WITH_PSLIB std::vector> value_ptr_; +#endif +#ifdef PADDLE_WITH_PSCORE + std::vector> value_ptr_; +#endif std::vector> device_values_; std::vector> device_keys_; std::vector mutex_; @@ -66,6 +77,21 @@ class HeterContext { mutex_[i] = new std::mutex(); } } + + void Reset() { + for (size_t i = 0; i < feature_keys_.size(); ++i) { + feature_keys_[i].clear(); + } + for (size_t i = 0; i < value_ptr_.size(); ++i) { + value_ptr_[i].clear(); + } + for (size_t i = 0; i < device_values_.size(); ++i) { + device_values_[i].clear(); + } + for (size_t i = 0; i < device_keys_.size(); ++i) { + device_keys_[i].clear(); + } + } void batch_add_keys( const std::vector>& thread_keys) { assert(thread_keys.size() == feature_keys_.size()); @@ -79,6 +105,15 @@ class HeterContext { } } + void batch_add_keys(int shard_num, + const std::unordered_set& shard_keys) { + int idx = feature_keys_[shard_num].size(); + feature_keys_[shard_num].resize(feature_keys_[shard_num].size() + + shard_keys.size()); + std::copy(shard_keys.begin(), shard_keys.end(), + feature_keys_[shard_num].begin() + idx); + } + void UniqueKeys() { std::vector threads; auto unique_func = [this](int i) { diff --git a/paddle/fluid/framework/fleet/heter_ps/feature_value.h b/paddle/fluid/framework/fleet/heter_ps/feature_value.h index 698ece09de6c5..c3bf33b32c2da 
100644 --- a/paddle/fluid/framework/fleet/heter_ps/feature_value.h +++ b/paddle/fluid/framework/fleet/heter_ps/feature_value.h @@ -14,7 +14,7 @@ limitations under the License. */ #pragma once -#ifdef PADDLE_WITH_PSLIB +#ifdef PADDLE_WITH_HETERPS #include diff --git a/paddle/fluid/framework/fleet/heter_ps/hashtable.h b/paddle/fluid/framework/fleet/heter_ps/hashtable.h index e5c0972763bed..089130f6da8c7 100644 --- a/paddle/fluid/framework/fleet/heter_ps/hashtable.h +++ b/paddle/fluid/framework/fleet/heter_ps/hashtable.h @@ -17,11 +17,17 @@ limitations under the License. */ #include #include #include +#ifdef PADDLE_WITH_PSLIB #include "common_value.h" // NOLINT +#endif +#ifdef PADDLE_WITH_PSCORE +#endif #include "thrust/pair.h" //#include "cudf/concurrent_unordered_map.cuh.h" #include "paddle/fluid/framework/fleet/heter_ps/cudf/concurrent_unordered_map.cuh.h" -#ifdef PADDLE_WITH_PSLIB +#ifdef PADDLE_WITH_HETERPS +#include "paddle/fluid/distributed/table/depends/large_scale_kv.h" +#include "paddle/fluid/platform/type_defs.h" namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/fleet/heter_ps/hashtable_inl.h b/paddle/fluid/framework/fleet/heter_ps/hashtable_inl.h index 871f9c7857af4..098c795fc7e1f 100644 --- a/paddle/fluid/framework/fleet/heter_ps/hashtable_inl.h +++ b/paddle/fluid/framework/fleet/heter_ps/hashtable_inl.h @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#ifdef PADDLE_WITH_PSLIB +#ifdef PADDLE_WITH_HETERPS namespace paddle { namespace framework { @@ -119,6 +119,7 @@ void HashTable::dump_to_cpu(int devid, cudaStream_t stream) { continue; } ValType& gpu_val = kv[i].second; +#ifdef PADDLE_WITH_PSLIB auto* downpour_value = (paddle::ps::DownpourFixedFeatureValue*)(gpu_val.cpu_ptr); int downpour_value_size = downpour_value->size(); @@ -138,6 +139,14 @@ void HashTable::dump_to_cpu(int devid, cudaStream_t stream) { cpu_val[x + 7] = gpu_val.mf[x]; } } +#endif +#ifdef PADDLE_WITH_PSCORE + auto* downpour_value = (paddle::distributed::VALUE*)(gpu_val.cpu_ptr); + downpour_value->count_ = gpu_val.show; + for (int x = 0; x < gpu_val.mf_size; x++) { + downpour_value->data_[x] = gpu_val.mf[x]; + } +#endif } container_->prefetch(devid, stream); diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_comm.h b/paddle/fluid/framework/fleet/heter_ps/heter_comm.h index 0e38ebbd7f4e7..2ec2a8a1f1e22 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_comm.h +++ b/paddle/fluid/framework/fleet/heter_ps/heter_comm.h @@ -25,7 +25,7 @@ limitations under the License. */ #include "paddle/fluid/platform/place.h" #include "thrust/pair.h" -#ifdef PADDLE_WITH_PSLIB +#ifdef PADDLE_WITH_HETERPS namespace paddle { namespace framework { @@ -182,7 +182,7 @@ class HeterComm { std::vector> path_; std::vector storage_; int feanum_{1800 * 2048}; - int multi_node_{1}; + int multi_node_{0}; std::vector nccl_inner_comms_; std::vector nccl_inter_comms_; int node_size_; diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h b/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h index 2f1c809c01eaa..1b4205e3c38fe 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h +++ b/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h @@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.
*/ #pragma once +#ifdef PADDLE_WITH_HETERPS #include -#ifdef PADDLE_WITH_PSLIB namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_ps.cu b/paddle/fluid/framework/fleet/heter_ps/heter_ps.cu index f2e129ded9fef..581b0d511c23e 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_ps.cu +++ b/paddle/fluid/framework/fleet/heter_ps/heter_ps.cu @@ -15,7 +15,7 @@ limitations under the License. */ #include #include "paddle/fluid/framework/fleet/heter_ps/heter_ps.h" -#ifdef PADDLE_WITH_PSLIB +#ifdef PADDLE_WITH_HETERPS namespace paddle { namespace framework { @@ -54,8 +54,8 @@ void HeterPs::show_one_table(int gpu_num) { comm_->show_one_table(gpu_num); } void HeterPs::push_sparse(int num, FeatureKey* d_keys, FeaturePushValue* d_grads, size_t len) { - // comm_->push_sparse(num, d_keys, d_grads, len, opt_); - comm_->push_sparse_multi_node(num, d_keys, d_grads, len, opt_); + comm_->push_sparse(num, d_keys, d_grads, len, opt_); + // comm_->push_sparse_multi_node(num, d_keys, d_grads, len, opt_); } void HeterPs::set_nccl_comm_and_size(const std::vector& inner_comms, diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_ps.h b/paddle/fluid/framework/fleet/heter_ps/heter_ps.h index 142f4a93b93a2..d78b6b492074d 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_ps.h +++ b/paddle/fluid/framework/fleet/heter_ps/heter_ps.h @@ -18,7 +18,7 @@ limitations under the License. */ #include "paddle/fluid/framework/fleet/heter_ps/heter_ps_base.h" #include "paddle/fluid/framework/fleet/heter_ps/optimizer.cuh.h" -#ifdef PADDLE_WITH_PSLIB +#ifdef PADDLE_WITH_HETERPS namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_ps_base.h b/paddle/fluid/framework/fleet/heter_ps/heter_ps_base.h index 7980220eab9b9..05b3ecf9c3c12 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_ps_base.h +++ b/paddle/fluid/framework/fleet/heter_ps/heter_ps_base.h @@ -17,7 +17,7 @@ limitations under the License. */ #include "paddle/fluid/framework/fleet/heter_ps/feature_value.h" #include "paddle/fluid/framework/fleet/heter_ps/heter_resource.h" -#ifdef PADDLE_WITH_PSLIB +#ifdef PADDLE_WITH_HETERPS namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_resource.cc b/paddle/fluid/framework/fleet/heter_ps/heter_resource.cc index f65b664f83ba0..0f2af2a522e28 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_resource.cc +++ b/paddle/fluid/framework/fleet/heter_ps/heter_resource.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#ifdef PADDLE_WITH_PSLIB +#ifdef PADDLE_WITH_HETERPS #include "heter_resource.h" #include "paddle/fluid/platform/cuda_device_guard.h" diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_resource.h b/paddle/fluid/framework/fleet/heter_ps/heter_resource.h index ad7649a8a33cb..7b23379994c73 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_resource.h +++ b/paddle/fluid/framework/fleet/heter_ps/heter_resource.h @@ -20,7 +20,7 @@ limitations under the License. 
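The heter_ps.cu hunk above routes HeterPs::push_sparse back through the single-process communication path, and multi_node_ now defaults to 0 in both HeterComm and PSGPUWrapper. A small hedged sketch of the call as seen by a caller, assuming push_sparse is exposed on the HeterPsBase interface declared in heter_ps_base.h (signatures taken from the hunks above; the wrapper function is illustrative only):

#include "paddle/fluid/framework/fleet/heter_ps/heter_ps_base.h"

void PushOneTableBatch(paddle::framework::HeterPsBase* heter_ps, int dev_num,
                       paddle::framework::FeatureKey* d_keys,
                       paddle::framework::FeaturePushValue* d_grads,
                       size_t len) {
  // With this patch the gradients flow through push_sparse (single node);
  // push_sparse_multi_node stays in the code but is no longer the default,
  // matching multi_node_{0} above.
  heter_ps->push_sparse(dev_num, d_keys, d_grads, len);
}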
*/ #include "paddle/fluid/platform/cuda_device_guard.h" #include "paddle/fluid/platform/enforce.h" -#ifdef PADDLE_WITH_PSLIB +#ifdef PADDLE_WITH_HETERPS namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/fleet/heter_ps/optimizer.cuh.h b/paddle/fluid/framework/fleet/heter_ps/optimizer.cuh.h index b3ec9e752e62b..7e82a8e014fd3 100644 --- a/paddle/fluid/framework/fleet/heter_ps/optimizer.cuh.h +++ b/paddle/fluid/framework/fleet/heter_ps/optimizer.cuh.h @@ -18,7 +18,7 @@ limitations under the License. */ #include "optimizer_conf.h" #include "paddle/fluid/framework/fleet/heter_ps/feature_value.h" -#ifdef PADDLE_WITH_PSLIB +#ifdef PADDLE_WITH_HETERPS namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc index 4274876c9975e..67ff6b6acaefb 100644 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc @@ -26,8 +26,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#if (defined PADDLE_WITH_NCCL || defined PADDLE_WITH_RCCL) && \ - (defined PADDLE_WITH_PSLIB) +#ifdef PADDLE_WITH_HETERPS #include #include @@ -58,7 +57,12 @@ void PSGPUWrapper::BuildTask(std::shared_ptr gpu_task, auto& device_mutex = gpu_task->mutex_; std::vector threads; +#ifdef PADDLE_WITH_PSLIB auto fleet_ptr = FleetWrapper::GetInstance(); +#endif +#ifdef PADDLE_WITH_PSCORE + auto fleet_ptr = paddle::distributed::Communicator::GetInstance(); +#endif // data should be in input channel thread_keys_.resize(thread_keys_thread_num_); @@ -99,12 +103,26 @@ void PSGPUWrapper::BuildTask(std::shared_ptr gpu_task, timeline.Start(); + threads.clear(); // merge thread_keys to shard_keys - for (size_t i = 0; i < thread_keys_.size(); i++) { - gpu_task->batch_add_keys(thread_keys_[i]); - for (int j = 0; j < thread_keys_thread_num_; j++) { - thread_keys_[i][j].clear(); + auto merge_ins_func = [this, gpu_task](int shard_num) { + for (int i = 0; i < thread_keys_thread_num_; ++i) { + gpu_task->batch_add_keys(shard_num, thread_keys_[i][shard_num]); + thread_keys_[i][shard_num].clear(); } + }; + + // for (size_t i = 0; i < thread_keys_.size(); i++) { + // gpu_task->batch_add_keys(thread_keys_[i]); + // for (int j = 0; j < thread_keys_thread_num_; j++) { + // thread_keys_[i][j].clear(); + // } + //} + for (int i = 0; i < thread_keys_shard_num_; ++i) { + threads.push_back(std::thread(merge_ins_func, i)); + } + for (auto& t : threads) { + t.join(); } timeline.Pause(); @@ -124,9 +142,16 @@ void PSGPUWrapper::BuildTask(std::shared_ptr gpu_task, auto ptl_func = [this, &local_keys, &local_ptr, &table_id, &fleet_ptr](int i) { size_t key_size = local_keys[i].size(); +#ifdef PADDLE_WITH_PSLIB auto tt = fleet_ptr->pslib_ptr_->_worker_ptr->pull_sparse_ptr( reinterpret_cast(local_ptr[i].data()), table_id, local_keys[i].data(), key_size); +#endif +#ifdef PADDLE_WITH_PSCORE + auto tt = fleet_ptr->_worker_ptr->pull_sparse_ptr( + reinterpret_cast(local_ptr[i].data()), table_id, + local_keys[i].data(), key_size); +#endif tt.wait(); auto status = tt.get(); // auto status = 0; @@ -153,8 +178,14 @@ void PSGPUWrapper::BuildTask(std::shared_ptr gpu_task, auto build_func = [device_num, &local_keys, &local_ptr, &device_keys, &device_vals, &device_mutex](int i) { std::vector> task_keys(device_num); +#ifdef PADDLE_WITH_PSLIB std::vector> task_ptrs( device_num); +#endif + +#ifdef 
PADDLE_WITH_PSCORE + std::vector> task_ptrs(device_num); +#endif for (size_t j = 0; j < local_keys[i].size(); j++) { int shard = local_keys[i][j] % device_num; @@ -169,7 +200,7 @@ void PSGPUWrapper::BuildTask(std::shared_ptr gpu_task, int cur = device_keys[dev].size(); device_keys[dev].resize(device_keys[dev].size() + len); device_vals[dev].resize(device_vals[dev].size() + len); - +#ifdef PADDLE_WITH_PSLIB for (int j = 0; j < len; ++j) { device_keys[dev][cur + j] = task_keys[dev][j]; float* ptr_val = task_ptrs[dev][j]->data(); @@ -196,6 +227,35 @@ void PSGPUWrapper::BuildTask(std::shared_ptr gpu_task, } } } +#endif +#ifdef PADDLE_WITH_PSCORE + for (int j = 0; j < len; ++j) { + device_keys[dev][cur + j] = task_keys[dev][j]; + distributed::VALUE* ptr_val = task_ptrs[dev][j]; + FeatureValue& val = device_vals[dev][cur + j]; + bool has_mf = 1; + val.delta_score = 0; + val.show = ptr_val->count_; + val.clk = 0; + val.slot = 0; + val.lr = 0; + val.lr_g2sum = 0; + val.cpu_ptr = (uint64_t)(task_ptrs[dev][j]); + + if (has_mf) { + val.mf_size = MF_DIM + 1; + for (int x = 0; x < val.mf_size; x++) { + val.mf[x] = ptr_val->data_[x]; + } + } else { + val.mf_size = 0; + for (int x = 0; x < MF_DIM + 1; x++) { + val.mf[x] = 0; + } + } + } +#endif + VLOG(1) << "GpuPs build hbmps done"; device_mutex[dev]->unlock(); } @@ -215,6 +275,7 @@ void PSGPUWrapper::BuildTask(std::shared_ptr gpu_task, void PSGPUWrapper::BuildGPUPS(uint64_t table_id, int feature_dim) { int device_num = heter_devices_.size(); std::shared_ptr gpu_task = gpu_task_pool_.Get(); + gpu_task->Reset(); BuildTask(gpu_task, table_id, feature_dim); platform::Timer timeline; timeline.Start(); @@ -227,8 +288,8 @@ void PSGPUWrapper::BuildGPUPS(uint64_t table_id, int feature_dim) { size_max = std::max(size_max, feature_keys_count[i]); } if (HeterPs_) { - HeterPs_->show_one_table(0); - return; + delete HeterPs_; + HeterPs_ = nullptr; } std::vector threads(device_num); HeterPs_ = HeterPsBase::get_instance(size_max, resource_); @@ -249,6 +310,7 @@ void PSGPUWrapper::BuildGPUPS(uint64_t table_id, int feature_dim) { timeline.Pause(); VLOG(1) << "GpuPs build table total costs: " << timeline.ElapsedSec() << " s."; + gpu_task_pool_.Push(gpu_task); } void PSGPUWrapper::PullSparse(const paddle::platform::Place& place, diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cu b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cu index 2eedcd5f1c700..2bf564d3f76d5 100644 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cu +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cu @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#ifdef PADDLE_WITH_PSLIB +#ifdef PADDLE_WITH_HETERPS #include #include #include diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.h b/paddle/fluid/framework/fleet/ps_gpu_wrapper.h index ef586b41fe05d..cfb23d1be2acf 100644 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.h +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.h @@ -14,8 +14,7 @@ limitations under the License. */ #pragma once -#if (defined PADDLE_WITH_NCCL || defined PADDLE_WITH_RCCL) && \ - (defined PADDLE_WITH_PSLIB) +#ifdef PADDLE_WITH_HETERPS #include #include @@ -26,7 +25,6 @@ limitations under the License. */ #include #include #include - #ifdef PADDLE_WITH_GLOO #include #include "paddle/fluid/framework/fleet/gloo_wrapper.h" @@ -42,6 +40,9 @@ limitations under the License. 
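A hedged sketch of the per-pass flow that the BuildGPUPS changes above enable: each call now resets the pooled gpu_task, rebuilds the HBM table from freshly pulled CPU values, and returns the task to the pool, instead of returning early once HeterPs_ exists. PSGPUWrapper::GetInstance() is assumed to be the usual singleton accessor; it is not shown in this diff.

#include "paddle/fluid/framework/fleet/ps_gpu_wrapper.h"

void TrainPasses(uint64_t table_id, int feature_dim, int pass_num) {
  // Assumed singleton accessor; not part of this diff.
  auto ps_gpu_ptr = paddle::framework::PSGPUWrapper::GetInstance();
  for (int pass = 0; pass < pass_num; ++pass) {
    // After this patch each call rebuilds the GPU table for the pass's keys
    // rather than reusing the table built in the first pass.
    ps_gpu_ptr->BuildGPUPS(table_id, feature_dim);
    // ... run the pass's minibatches here ...
  }
}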
*/ #include "paddle/fluid/platform/gpu_info.h" #include "paddle/fluid/platform/macros.h" // for DISABLE_COPY_AND_ASSIGN #include "paddle/fluid/platform/place.h" +#ifdef PADDLE_WITH_PSCORE +#include "paddle/fluid/distributed/service/communicator.h" +#endif namespace paddle { namespace framework { @@ -219,7 +220,7 @@ class PSGPUWrapper { std::shared_ptr resource_; int32_t sleep_seconds_before_fail_exit_; std::vector slot_vector_; - int multi_node_{1}; + int multi_node_{0}; int node_size_; std::vector inner_comms_; std::vector inter_comms_; diff --git a/paddle/fluid/framework/garbage_collector.cc b/paddle/fluid/framework/garbage_collector.cc index 8dfbd3c268b86..9ab6b5d8c178b 100644 --- a/paddle/fluid/framework/garbage_collector.cc +++ b/paddle/fluid/framework/garbage_collector.cc @@ -122,6 +122,32 @@ void CUDAPinnedGarbageCollector::ClearCallback( } #endif +#ifdef PADDLE_WITH_ASCEND_CL +NPUDefaultStreamGarbageCollector::NPUDefaultStreamGarbageCollector( + const platform::NPUPlace &place, size_t max_memory_size) + : GarbageCollector(place, max_memory_size) {} + +void NPUDefaultStreamGarbageCollector::Wait() const { + static_cast(this->dev_ctx_) + ->WaitStreamCallback(); +} + +void NPUDefaultStreamGarbageCollector::ClearCallback( + const std::function &callback) { + static_cast(this->dev_ctx_) + ->AddStreamCallback(callback); +} +NPUUnsafeFastGarbageCollector::NPUUnsafeFastGarbageCollector( + const platform::NPUPlace &place, size_t max_memory_size) + : GarbageCollector(place, max_memory_size) {} + +void NPUUnsafeFastGarbageCollector::ClearCallback( + const std::function &callback) { + callback(); +} + +#endif + int64_t GetEagerDeletionThreshold() { return FLAGS_eager_delete_tensor_gb < 0 ? -1 diff --git a/paddle/fluid/framework/garbage_collector.h b/paddle/fluid/framework/garbage_collector.h index 572c79d21a045..2c2b57bbe420a 100644 --- a/paddle/fluid/framework/garbage_collector.h +++ b/paddle/fluid/framework/garbage_collector.h @@ -131,6 +131,28 @@ class CUDAPinnedGarbageCollector : public GarbageCollector { }; #endif +#ifdef PADDLE_WITH_ASCEND_CL +class NPUDefaultStreamGarbageCollector : public GarbageCollector { + public: + NPUDefaultStreamGarbageCollector(const platform::NPUPlace &place, + size_t max_memory_size); + + void Wait() const override; + + protected: + void ClearCallback(const std::function &callback) override; +}; + +class NPUUnsafeFastGarbageCollector : public GarbageCollector { + public: + NPUUnsafeFastGarbageCollector(const platform::NPUPlace &place, + size_t max_memory_size); + + protected: + void ClearCallback(const std::function &callback) override; +}; +#endif + template void GarbageCollector::Add(Container &&objs) { Add(std::forward(objs), []() {}); diff --git a/paddle/fluid/framework/heter_service.h b/paddle/fluid/framework/heter_service.h index 8f52235c96244..3f65eaf3aa121 100644 --- a/paddle/fluid/framework/heter_service.h +++ b/paddle/fluid/framework/heter_service.h @@ -30,10 +30,12 @@ limitations under the License. 
*/ #include "brpc/controller.h" #include "brpc/server.h" #include "paddle/fluid/platform/timer.h" +#endif namespace paddle { namespace framework { +#ifdef PADDLE_WITH_PSLIB typedef std::function HeterServiceHandler; class DataFeed; @@ -142,7 +144,7 @@ class HeterTask { double cpu_2_gpu_time{0}; platform::Timer timeline; }; - +#endif template class HeterObjectPool { public: @@ -153,7 +155,7 @@ class HeterObjectPool { if (pool_.empty()) { num_ += 1; #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - VLOG(0) << "pool construct size: " << num_; + VLOG(3) << "pool construct size: " << num_; #endif return std::make_shared(); } else { @@ -178,6 +180,7 @@ class HeterObjectPool { int num_{0}; }; +#ifdef PADDLE_WITH_PSLIB struct BthreadMutextGuard { BthreadMutextGuard(bthread_mutex_t* rho) { mutex_ = rho; @@ -258,7 +261,6 @@ class HeterList { std::unique_lock lock(mutex_); cond_.wait(lock, [this] { return size < cap_; }); if (task_map_.find(key) != task_map_.end()) { - // std::cout << "try put key=" << key << " false" << std::endl; task_map_.erase(key); return false; } else { @@ -267,7 +269,6 @@ class HeterList { node->value = value; map_[node->key] = node; attach(node); - // std::cout << "try put key=" << key << " true" << std::endl; return true; } } @@ -276,7 +277,6 @@ class HeterList { std::unique_lock lock(mutex_); cond_.wait(lock, [this] { return size < cap_; }); HeterNode* node = new HeterNode; - // std::cout << "put key=" << key << " true" << std::endl; node->key = key; node->value = value; map_[node->key] = node; @@ -288,7 +288,6 @@ class HeterList { std::lock_guard lock(mutex_); auto iter = map_.find(key); if (iter != map_.end()) { - // std::cout << "try get key=" << key << " true" << std::endl; HeterNode* node = iter->second; detach(node); cond_.notify_one(); @@ -298,7 +297,6 @@ class HeterList { return ret; } task_map_.insert(key); - // std::cout << "try get key=" << key << " false" << std::endl; return nullptr; } @@ -306,7 +304,6 @@ class HeterList { std::lock_guard lock(mutex_); auto iter = map_.find(key); if (iter != map_.end()) { - // std::cout << "get key=" << key << " true" << std::endl; HeterNode* node = iter->second; detach(node); cond_.notify_one(); @@ -315,7 +312,6 @@ class HeterList { delete node; return ret; } - // std::cout << "get key=" << key << " false" << std::endl; return nullptr; } @@ -323,14 +319,12 @@ class HeterList { std::lock_guard lock(mutex_); HeterNode* node = head_->next; if (node == tail_) { - // std::cout << "get2 false" << std::endl; return nullptr; } else { detach(node); cond_.notify_one(); T ret = std::move(node->value); map_.erase(node->key); - // std::cout << "get2 key=" << node->key << " true" << std::endl; delete node; return ret; } @@ -371,7 +365,7 @@ class HeterList { int cap_; int size; }; +#endif } // namespace framework } // namespace paddle -#endif diff --git a/paddle/fluid/framework/ir/embedding_eltwise_layernorm_fuse_pass.cc b/paddle/fluid/framework/ir/embedding_eltwise_layernorm_fuse_pass.cc index 44069f61d93ff..59d071e103459 100644 --- a/paddle/fluid/framework/ir/embedding_eltwise_layernorm_fuse_pass.cc +++ b/paddle/fluid/framework/ir/embedding_eltwise_layernorm_fuse_pass.cc @@ -299,6 +299,11 @@ static int BuildFusion(Graph* graph, const std::string& name_scope new_op_desc.SetOutput("Out", {end_pattern_out[k]->Name()}); new_op_desc.SetAttr("epsilon", end_patter_layernorms[k]->Op()->GetAttr("epsilon")); + + if (end_patter_layernorms[k]->Op()->HasAttr("out_threshold")) { + new_op_desc.SetAttr("enable_int8", true); + } + auto* 
embedding_eltwise_layernorm = graph->CreateOpNode(&new_op_desc); for (size_t iter = 0; iter < start_pattern_in_nodes[i].size(); ++iter) { diff --git a/paddle/fluid/framework/ir/is_test_pass.cc b/paddle/fluid/framework/ir/is_test_pass.cc index 0a70440765d44..25bf03f426a1d 100644 --- a/paddle/fluid/framework/ir/is_test_pass.cc +++ b/paddle/fluid/framework/ir/is_test_pass.cc @@ -35,7 +35,7 @@ void IsTestPass::ApplyImpl(ir::Graph* graph) const { "hard_shrink", "hard_sigmoid", "relu6", "soft_relu", "swish", "thresholded_relu", "log", "square", "softplus", - "softsign"}; + "softsign", "silu"}; for (const Node* n : graph->Nodes()) { if (n->IsOp()) { auto* op = n->Op(); diff --git a/paddle/fluid/framework/ir/mkldnn/interpolate_mkldnn_pass.cc b/paddle/fluid/framework/ir/mkldnn/interpolate_mkldnn_pass.cc index 06df1caca35b9..4eb532b47cb4b 100644 --- a/paddle/fluid/framework/ir/mkldnn/interpolate_mkldnn_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/interpolate_mkldnn_pass.cc @@ -43,8 +43,9 @@ void InterpolateMKLDNNPass::ApplyImpl(ir::Graph* graph) const { int found_count = 0; const std::vector interpolate_op_types = { - "bilinear_interp", "nearest_interp", "trilinear_interp", "bicubic_interp", - "linear_interp"}; + "bilinear_interp", "nearest_interp", "trilinear_interp", + "bicubic_interp", "linear_interp", "bilinear_interp_v2", + "nearest_interp_v2"}; for (const Node* node : graph->Nodes()) { if (node->IsOp() && diff --git a/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc b/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc index e8f4dbd29543c..1e8349e878781 100644 --- a/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc +++ b/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc @@ -535,6 +535,38 @@ static int BuildFusionV2(Graph* graph, const std::string& name_scope, multihead_op_desc.SetAttr("alpha", scale_attr); multihead_op_desc.SetAttr("head_number", head_number); + auto* mul0_op_desc = mul0->Op(); + auto* mul1_op_desc = mul1->Op(); + auto* mul2_op_desc = mul2->Op(); + if (mul0_op_desc->HasAttr("enable_int8")) { + multihead_op_desc.SetAttr("enable_int8", + mul0_op_desc->GetAttr("enable_int8")); + // all mul op has same input. 
+ multihead_op_desc.SetAttr("Input_scale", + mul0_op_desc->GetAttr("X_scale")); + auto weight_scale0 = BOOST_GET_CONST( + std::vector, mul0_op_desc->GetAttr("weight_scale")); + auto weight_scale1 = BOOST_GET_CONST( + std::vector, mul1_op_desc->GetAttr("weight_scale")); + auto weight_scale2 = BOOST_GET_CONST( + std::vector, mul2_op_desc->GetAttr("weight_scale")); + auto weight_max = std::max(weight_scale0, weight_scale1); + weight_max = std::max(weight_max, weight_scale2); + multihead_op_desc.SetAttr("weight_scale", weight_max); + + if (mul0_op_desc->HasAttr("out_threshold")) { + auto out_scale0 = + BOOST_GET_CONST(float, mul0_op_desc->GetAttr("out_threshold")); + auto out_scale1 = + BOOST_GET_CONST(float, mul1_op_desc->GetAttr("out_threshold")); + auto out_scale2 = + BOOST_GET_CONST(float, mul2_op_desc->GetAttr("out_threshold")); + auto out_scale_max = std::max(out_scale0, out_scale1); + out_scale_max = std::max(out_scale_max, out_scale2); + multihead_op_desc.SetAttr("out_threshold", out_scale_max); + } + } + auto* multihead = graph->CreateOpNode(&multihead_op_desc); IR_NODE_LINK_TO(input0, multihead); diff --git a/paddle/fluid/framework/ir/placement_pass_base.cc b/paddle/fluid/framework/ir/placement_pass_base.cc index fd604ffe7b5de..35ba920060779 100644 --- a/paddle/fluid/framework/ir/placement_pass_base.cc +++ b/paddle/fluid/framework/ir/placement_pass_base.cc @@ -77,7 +77,8 @@ bool PlacementPassBase::IsDefaultOpTypes(const std::string& op_type) const { // the corresponding pass. const std::vector not_default_op_types = { "bilinear_interp", "nearest_interp", "trilinear_interp", - "bicubic_interp", "linear_interp"}; + "bicubic_interp", "linear_interp", "bilinear_interp_v2", + "linear_interp_v2"}; bool is_interpolate_op = std::find(not_default_op_types.begin(), not_default_op_types.end(), op_type) != not_default_op_types.end(); diff --git a/paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.cc b/paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.cc index 5043fce8885cd..2fc39fd25d56c 100644 --- a/paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.cc +++ b/paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.cc @@ -225,10 +225,13 @@ void FuseDequant(ir::Graph* graph, Scope* scope, quantized_op_type == "depthwise_conv2d") { PADDLE_ENFORCE_EQ( dequant_type, "fake_channel_wise_dequantize_max_abs", - platform::errors::InvalidArgument("conv2d op must be dequantized by " - "[fake_channel_wise_dequantize_max_" - "abs], but got %s", - dequant_type)); + platform::errors::InvalidArgument( + "conv2d op must be dequantized by " + "[fake_channel_wise_dequantize_max_abs], but got %s. 
" + "If you uses PaddleSlim to generate the quantized " + "model, please set the 'weight_quantize_type' params as " + "'channel_wise_abs_max' and generate the quantized model again.", + dequant_type)); PADDLE_ENFORCE_EQ( weight_scale.size(), static_cast(w_dims[0]), platform::errors::InvalidArgument( diff --git a/paddle/fluid/framework/ir/skip_layernorm_fuse_pass.cc b/paddle/fluid/framework/ir/skip_layernorm_fuse_pass.cc index 0e63320f2f7ad..232e1d8da4ded 100644 --- a/paddle/fluid/framework/ir/skip_layernorm_fuse_pass.cc +++ b/paddle/fluid/framework/ir/skip_layernorm_fuse_pass.cc @@ -153,6 +153,10 @@ void SkipLayerNormFusePass::ApplyImpl(ir::Graph *graph) const { new_desc.SetInput("Scale", {layer_norm_scale->Name()}); new_desc.SetInput("Bias", {layer_norm_bias->Name()}); + if (elementwise->Op()->HasAttr("out_threshold")) { + new_desc.SetAttr("enable_int8", true); + } + // outputs new_desc.SetOutput("Out", {layer_norm_out->Name()}); diff --git a/paddle/fluid/framework/lod_tensor.cc b/paddle/fluid/framework/lod_tensor.cc index 3a79452e230ef..0a6b5e44452fe 100644 --- a/paddle/fluid/framework/lod_tensor.cc +++ b/paddle/fluid/framework/lod_tensor.cc @@ -268,6 +268,21 @@ void SerializeToStream(std::ostream &os, const LoDTensor &tensor, TensorToStream(os, static_cast(tensor), dev_ctx); } +void SerializeToStream(std::ostream &os, const LoDTensor &tensor) { + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + const platform::DeviceContext *dev_ctx; + auto place = tensor.place(); + dev_ctx = pool.Get(place); + SerializeToStream(os, tensor, *dev_ctx); +} + +void DeserializeFromStream(std::ifstream &os, LoDTensor *tensor) { + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + const platform::DeviceContext *dev_ctx; + dev_ctx = pool.Get(platform::CPUPlace()); + DeserializeFromStream(os, tensor, *dev_ctx); +} + void DeserializeFromStream(std::istream &is, LoDTensor *tensor, const platform::DeviceContext &dev_ctx, const size_t &seek, diff --git a/paddle/fluid/framework/lod_tensor.h b/paddle/fluid/framework/lod_tensor.h index b8911154e6bf7..6b357aba1c5f9 100644 --- a/paddle/fluid/framework/lod_tensor.h +++ b/paddle/fluid/framework/lod_tensor.h @@ -14,16 +14,11 @@ limitations under the License. 
*/ #pragma once +#include #include #include #include #include -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -#include -#include -#endif - -#include #include "paddle/fluid/framework/ddim.h" #include "paddle/fluid/framework/mixed_vector.h" @@ -260,5 +255,9 @@ LoD ConvertToLengthBasedLoD(const LoD& offset_lod); LoD ConvertToOffsetBasedLoD(const LoD& length_lod); +void SerializeToStream(std::ostream& os, const LoDTensor& tensor); + +void DeserializeFromStream(std::ifstream& os, LoDTensor* tensor); + } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/multi_trainer.cc b/paddle/fluid/framework/multi_trainer.cc index ff8e71b92e0ac..198bb65863bb6 100644 --- a/paddle/fluid/framework/multi_trainer.cc +++ b/paddle/fluid/framework/multi_trainer.cc @@ -38,6 +38,13 @@ void MultiTrainer::Initialize(const TrainerDesc& trainer_desc, need_merge_var_names_.push_back( trainer_desc.downpour_param().stat_var_names(i)); } +#ifdef PADDLE_WITH_HETERPS + for (int i = 0; i < thread_num_; ++i) { + int num = trainer_desc.worker_places(i); + platform::CUDAPlace place = platform::CUDAPlace(num); + places_.push_back(place); + } +#endif // get filelist from trainer_desc here const std::vector readers = dataset->GetReaders(); @@ -102,13 +109,42 @@ void MultiTrainer::InitDumpEnv() { void MultiTrainer::InitTrainerEnv(const ProgramDesc& main_program, const platform::Place& place) { for (int i = 0; i < thread_num_; ++i) { +#ifdef PADDLE_WITH_HETERPS + workers_[i]->SetPlace(places_[i]); + workers_[i]->SetReaderPlace(places_[i]); +#else workers_[i]->SetPlace(place); workers_[i]->SetReaderPlace(place); +#endif workers_[i]->SetRootScope(root_scope_); workers_[i]->CreateDeviceResource(main_program); // Program workers_[i]->BindingDataFeedMemory(); workers_[i]->CacheProgram(main_program); } +#ifdef PADDLE_WITH_HETERPS + for (int num = 0; num < thread_num_; ++num) { + auto place = places_[num]; + Scope* scope = workers_[num]->GetThreadScope(); + auto& block = main_program.Block(0); + for (auto& var : block.AllVars()) { + if (var->Persistable()) { + auto name = var->Name(); + Variable* root_var = root_scope_->FindVar(name); + if (!root_var) { + continue; + } + if (root_var->IsType()) { + continue; + } + LoDTensor* root_tensor = root_var->GetMutable(); + auto* ptr = scope->Var(name); + InitializeVariable(ptr, proto::VarType::LOD_TENSOR); + LoDTensor* thread_tensor = ptr->GetMutable(); + TensorCopy(*root_tensor, place, thread_tensor); + } + } + } +#endif } void MultiTrainer::InitOtherEnv(const ProgramDesc& main_program) { @@ -138,10 +174,77 @@ void MultiTrainer::Run() { } } +#ifdef PADDLE_WITH_HETERPS +void MultiTrainer::MergeDenseParam() { + auto communicator = paddle::distributed::Communicator::GetInstance(); + auto& recv_ctx = communicator->GetRecvCtxMap(); + Scope* thread_scope = workers_[0]->GetThreadScope(); + for (auto& iter : recv_ctx) { + auto& varnames = iter.second; + for (auto& name : varnames) { + Variable* root_var = root_scope_->FindVar(name); + LoDTensor* root_tensor = root_var->GetMutable(); + Variable* var = thread_scope->FindVar(name); + LoDTensor* tensor = var->GetMutable(); + TensorCopy((*tensor), root_tensor->place(), root_tensor); + } + } +} +#endif + +template +void MultiTrainer::MergeToRootScope(LoDTensor* root_tensor, LoDTensor* tensor) { + LoDTensor tmp_root; + TensorCopy(*root_tensor, platform::CPUPlace(), &tmp_root); + T* tmp_root_data = tmp_root.data(); + LoDTensor tmp_tensor; + TensorCopy(*tensor, platform::CPUPlace(), &tmp_tensor); + T* data = tmp_tensor.data(); 
+ for (int i = 0; i < tmp_tensor.numel(); i++) { + tmp_root_data[i] += data[i]; + } + TensorCopy(tmp_root, platform::CPUPlace(), root_tensor); +} + void MultiTrainer::Finalize() { if (need_dump_field_ || need_dump_param_) { FinalizeDumpEnv(); } +#ifdef PADDLE_WITH_HETERPS + for (size_t i = 0; i < need_merge_var_names_.size(); i++) { + Variable* root_var = root_scope_->FindVar(need_merge_var_names_[i]); + if (root_var == nullptr) { + continue; + } + LoDTensor* root_tensor = root_var->GetMutable(); + + for (size_t j = 0; j < places_.size(); j++) { + Scope* cur_thread_scope = workers_[j]->GetThreadScope(); + Variable* thread_var = + cur_thread_scope->FindVar(need_merge_var_names_[i]); + if (thread_var == nullptr) { + continue; + } + LoDTensor* thread_tensor = thread_var->GetMutable(); +#define MergeCallback(cpp_type, proto_type) \ + do { \ + if (root_tensor->type() == proto_type) { \ + if (thread_tensor->type() != proto_type) { \ + VLOG(0) << "Error: thread id=" << j << ", need_merge_var_names_[" << i \ + << "] " << need_merge_var_names_[i] \ + << ", root tensor type=" << root_tensor->type() \ + << ", thread tensor type=" << thread_tensor->type(); \ + exit(-1); \ + } \ + MergeToRootScope(root_tensor, thread_tensor); \ + } \ + } while (0) + _ForEachDataType_(MergeCallback); + } + } + MergeDenseParam(); + +#endif root_scope_->DropKids(); } diff --git a/paddle/fluid/framework/op_registry.h b/paddle/fluid/framework/op_registry.h index 4c52932976122..818da7478b239 100644 --- a/paddle/fluid/framework/op_registry.h +++ b/paddle/fluid/framework/op_registry.h @@ -343,6 +343,12 @@ struct OpKernelRegistrarFunctorEx &places, InitP2P(places); ir::InitReaderQueueDeviceCount(graph, *(member_->global_scope_), member_->places_.size()); - member_->use_device_ = exec_strategy.use_device_; - member_->build_strategy_ = build_strategy; - member_->use_all_reduce_ = member_->build_strategy_.reduce_ == - BuildStrategy::ReduceStrategy::kAllReduce; - member_->nranks_ = build_strategy.num_trainers_ * places.size(); - if (!member_->use_all_reduce_ && member_->nranks_ == 1) { - LOG(INFO) << "If you set build_strategy.reduce with 'Reduce'," - "the number of places should be greater than 1."; - member_->build_strategy_.reduce_ = - BuildStrategy::ReduceStrategy::kAllReduce; - member_->use_all_reduce_ = true; - } -#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && defined(_WIN32) - if (member_->IsUseCUDA(member_->use_device_)) { - PADDLE_ENFORCE_EQ( - places.size(), 1, - platform::errors::Unavailable("Windows can support Single GPU only.")); - } -#endif - -#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && \ - (!defined(PADDLE_WITH_NCCL) && !defined(PADDLE_WITH_RCCL)) - if (member_->IsUseCUDA(member_->use_device_)) { - PADDLE_ENFORCE_EQ( - places.size(), 1, - platform::errors::PermissionDenied( - "Your machine has multiple cards, " - "but the WITH_NCCL option is not turned on during compilation, " - "and you cannot use multi-card training or prediction. " - "Please recompile and turn on the WITH_NCCL option.")); - } -#endif - - std::string device_name; - if (member_->use_device_ == p::kCPU) { - device_name = "CPU"; - } else if (member_->use_device_ == p::kCUDA) { - device_name = "CUDA"; - } else { - device_name = "XPU"; - } - - VLOG(1) << string::Sprintf( - "The Program will be executed on %s using ParallelExecutor, %lu " - "cards are used, so %lu programs are executed in parallel.", - device_name, places.size(), places.size()); - - // Step 1. Bcast the bcast_vars to devs. 
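Since the MergeToRootScope template above is split across hunks, here is a condensed, CPU-only restatement of what it does for one float variable: copy both tensors to CPU, accumulate element-wise, and write the sum back. This mirrors the hunk above and is an illustration, not the dispatch actually used (which goes through the MergeCallback macro over all data types).

#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/platform/place.h"

void AccumulateIntoRoot(paddle::framework::LoDTensor* root_tensor,
                        const paddle::framework::LoDTensor& thread_tensor) {
  namespace pf = paddle::framework;
  pf::LoDTensor tmp_root, tmp_thread;
  pf::TensorCopy(*root_tensor, paddle::platform::CPUPlace(), &tmp_root);
  pf::TensorCopy(thread_tensor, paddle::platform::CPUPlace(), &tmp_thread);
  float* dst = tmp_root.data<float>();
  const float* src = tmp_thread.data<float>();
  for (int64_t i = 0; i < tmp_thread.numel(); ++i) {
    dst[i] += src[i];
  }
  // Write the merged result back to the root-scope tensor on CPUPlace,
  // as the template above does.
  pf::TensorCopy(tmp_root, paddle::platform::CPUPlace(), root_tensor);
}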
- // Create local scopes - if (local_scopes.empty()) { - member_->own_local_scope_ = true; - member_->local_scopes_.emplace_back(member_->global_scope_); - for (size_t i = 1; i < member_->places_.size(); ++i) { - member_->local_scopes_.emplace_back(&scope->NewScope()); - } - } else { - member_->own_local_scope_ = false; - PADDLE_ENFORCE_EQ(member_->places_.size(), local_scopes.size(), - platform::errors::PreconditionNotMet( - "member_->places_.size() = %d is not equal to " - "local_scopes.size() = %d", - member_->places_.size(), local_scopes.size())); - for (size_t i = 0; i < member_->places_.size(); ++i) { - member_->local_scopes_.emplace_back(&local_scopes[i]->NewScope()); - } - } - - std::vector graphs; - if (member_->build_strategy_.async_mode_) { - PADDLE_ENFORCE_EQ(member_->IsUseCUDA(member_->use_device_), false, - platform::errors::Unavailable( - "gpu mode does not support async_mode_ now!")); - graphs.push_back(graph); - for (size_t i = 1; i < places.size(); ++i) { - auto *tmp_graph = new ir::Graph(graph->OriginProgram()); - async_graphs_.emplace_back(tmp_graph); - graphs.push_back(tmp_graph); - } - } - - // FIXME(Yancey1989): parallel graph mode get better performance - // in GPU allreduce distributed training. Need an elegant way to - // choice the execution strategy. - member_->build_strategy_.enable_parallel_graph_ = - EnableParallelGraphExecution(*graph, exec_strategy, - member_->build_strategy_); - if (member_->build_strategy_.enable_parallel_graph_) { - LOG(INFO) << "The Executor would execute the graph by ParallelGraph " - "Execution which can get better performance," - << "you can force it off by env FLAGS_enable_parallel_graph=0"; - } + // Initialize necessary info of member_ with strategy. + InitExecutorPrivateMemberInfo(exec_strategy, build_strategy, places.size(), + *graph); - if (member_->IsUseCUDA(member_->use_device_) && member_->nranks_ > 1) { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) - member_->InitOrGetNCCLCommunicator(scope, &member_->build_strategy_); - - // Initialize device context's nccl comm, will be used by normal - // Operators like sync_batch_norm, and collective ops. - // NOTE: more than one ParallelExecutor with same place, the nccl comm will - // be rewrite and there will be some problem. - // NOTE: NCCL group-calls and non-group-calls can not use the same - // NCCL communicator, so for ParallelGraph and Multi-Process mode, re-use - // same communicators. - auto *nccl_ctxs = - member_->nccl_ctxs_->GetSyncBatchNormCtx(scope, member_->places_); - auto &pool = platform::DeviceContextPool::Instance(); - for (size_t dev_id = 0; dev_id < member_->places_.size(); ++dev_id) { - auto *dev_ctx = static_cast( - pool.Get(member_->places_[dev_id])); - auto &nccl_ctx = nccl_ctxs->at(member_->places_[dev_id]); - dev_ctx->set_nccl_comm(nccl_ctx.comm()); - } -#else - PADDLE_THROW( - platform::errors::PreconditionNotMet("Not compiled with CUDA.")); -#endif - } - if (member_->use_device_ == p::kXPU && member_->nranks_ > 1) { -#if defined(PADDLE_WITH_XPU_BKCL) - member_->InitOrGetBKCLCommunicator(scope, member_->build_strategy_); + // Step 1. 
Create local scopes and Clone graph into multi device + CreateLocalScopes(scope, local_scopes, /*create_new*/ true); + std::vector graphs = CloneGraphToMultiDevices(graph); + PrepareNCCLCommunicator(scope); - auto *bkcl_ctxs = - member_->bkcl_ctxs_->GetSyncBatchNormCtx(scope, member_->places_); - auto &pool = platform::DeviceContextPool::Instance(); - for (size_t dev_id = 0; dev_id < member_->places_.size(); ++dev_id) { - auto *dev_ctx = static_cast( - pool.Get(member_->places_[dev_id])); - auto &bkcl_ctx = bkcl_ctxs->at(member_->places_[dev_id]); - dev_ctx->set_bkcl_context(bkcl_ctx.comm()); - } -#else - PADDLE_THROW( - platform::errors::PreconditionNotMet("Not compiled with XPU.")); -#endif - } // broadcast parameters from the 0th device to others: auto need_broadcast = [&]() -> bool { if (member_->build_strategy_.num_trainers_ > 1) { @@ -778,257 +651,75 @@ ParallelExecutor::ParallelExecutor(const std::vector &places, } return false; }; - // Bcast Parameters to all GPUs if (need_broadcast()) { BCastParamsToDevices(bcast_vars, member_->build_strategy_.trainer_id_); } - // Startup Program has been run. All local scopes has correct parameters. - // Step 2. Convert main_program to SSA form and dependency graph. Also, insert // ncclOp - std::vector async_graphs(places.size()); -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) - if (member_->build_strategy_.async_mode_) { - VLOG(3) << "use local async mode"; - graph = member_->build_strategy_.Apply( - graph, {member_->places_[0]}, loss_var_name, - {member_->local_scopes_[0]}, 1, member_->use_device_, - member_->nccl_ctxs_); - for (size_t i = 1; i < member_->places_.size(); ++i) { - graphs[i] = member_->build_strategy_.Apply( - graphs[i], {member_->places_[i]}, loss_var_name, - {member_->local_scopes_[i]}, 1, member_->use_device_, - member_->nccl_ctxs_); - async_graphs[i] = graphs[i]; - } - } else { - graph = member_->build_strategy_.Apply( - graph, member_->places_, loss_var_name, member_->local_scopes_, - member_->nranks_, member_->use_device_, member_->nccl_ctxs_); - } -#elif defined(PADDLE_WITH_XPU_BKCL) - if (member_->build_strategy_.async_mode_) { - VLOG(3) << "use local async mode"; - graph = member_->build_strategy_.Apply( - graph, {member_->places_[0]}, loss_var_name, - {member_->local_scopes_[0]}, 1, member_->use_device_, - member_->bkcl_ctxs_); - for (size_t i = 1; i < member_->places_.size(); ++i) { - graphs[i] = member_->build_strategy_.Apply( - graphs[i], {member_->places_[i]}, loss_var_name, - {member_->local_scopes_[i]}, 1, member_->use_device_, - member_->bkcl_ctxs_); - async_graphs[i] = graphs[i]; - } - } else { - graph = member_->build_strategy_.Apply( - graph, member_->places_, loss_var_name, member_->local_scopes_, - member_->nranks_, member_->use_device_, member_->bkcl_ctxs_); - } -#else - if (member_->build_strategy_.async_mode_) { - VLOG(3) << "use local async mode"; - graph = member_->build_strategy_.Apply( - graph, {member_->places_[0]}, loss_var_name, - {member_->local_scopes_[0]}, 1, member_->use_device_); - for (size_t i = 1; i < member_->places_.size(); ++i) { - graphs[i] = member_->build_strategy_.Apply( - graphs[i], {member_->places_[i]}, loss_var_name, - {member_->local_scopes_[i]}, 1, member_->use_device_); - async_graphs[i] = graphs[i]; - } - } else { - graph = member_->build_strategy_.Apply( - graph, member_->places_, loss_var_name, member_->local_scopes_, - member_->nranks_, member_->use_device_); - } -#endif - + std::vector async_graphs = + CompileGraphWithBuildStrategy(graph, &graphs, 
loss_var_name); graph = member_->ApplyMemoryOptimizePass(graph); - async_graphs[0] = graph; // Step 3. Create vars in each scope. Passes may also create new vars. // skip control vars and empty vars std::vector var_infos; - for (auto &node : graph->Nodes()) { - if (node->IsVar() && !node->IsCtrlVar() && node->Var()) { - var_infos.emplace_back(); - var_infos.back().name_ = node->Var()->Name(); - var_infos.back().type_ = node->Var()->GetType(); - var_infos.back().persistable_ = node->Var()->Persistable(); - - member_->is_persistable_.emplace(node->Var()->Name(), - node->Var()->Persistable()); - } - } - - if (graph->Has(details::kFusedVars)) { - auto &fused_vars = graph->Get(details::kFusedVars); - for (auto &fused_var : fused_vars) { - var_infos.emplace_back(); - var_infos.back() = fused_var.second; + CreateVariableInfos(&var_infos, graph); + std::unordered_map scope_map = + CreateLocalExecScopes(member_->local_scopes_, /*create_new*/ true); - member_->is_persistable_.emplace(fused_var.first, - fused_var.second.persistable_); - } - } + // Step 4. Create SSAGraph executor + std::vector final_graphs = + CreateSSAGraphExecutor(exec_strategy, &async_graphs, graph); - std::unordered_map scope_map; - for (auto *scope : member_->local_scopes_) { - auto &local_exec_scope = scope->NewScope(); - member_->local_exec_scopes_.emplace_back(&local_exec_scope); - scope_map.emplace(scope, &local_exec_scope); + VLOG(3) << "use ScopeBufferedSSAGraphExecutor"; + if (!member_->build_strategy_.async_mode_) { + member_->executor_.reset(new details::ScopeBufferedSSAGraphExecutor( + exec_strategy, member_->local_scopes_, member_->local_exec_scopes_, + std::move(var_infos), member_->places_, std::move(member_->executor_))); } - PADDLE_ENFORCE_EQ( - member_->local_scopes_.size(), member_->local_exec_scopes_.size(), - platform::errors::PreconditionNotMet( - "member_->local_scopes_.size() = %d is not equal to " - "member_->local_exec_scopes_.size() = %d", - member_->local_scopes_.size(), member_->local_exec_scopes_.size())); + ResetOpHandleScopeMapOfGraphs(final_graphs, scope_map); + SetReaderOpDeviceInfoOfGraphs(final_graphs); +} - std::vector final_graphs; +void ParallelExecutor::BCastParamsToDevices( + const std::vector &vars, int trainer_id) const { + VLOG(3) << "BCastParamsToDevices"; + // the initializing bcast, all vars would be bcast from device(0). + for (auto &var : vars) { + framework::Variable *main_var = member_->local_scopes_[0]->FindVar(var); + if (main_var == nullptr || !main_var->IsType()) { + continue; + } - if (member_->build_strategy_.async_mode_) { - VLOG(3) << "use AsyncSSAGraphExecutor"; - member_->executor_.reset(new details::AsyncSSAGraphExecutor( - exec_strategy, member_->local_scopes_, member_->local_exec_scopes_, - member_->places_, async_graphs)); - final_graphs = async_graphs; - } else if (member_->build_strategy_.enable_parallel_graph_) { - VLOG(3) << "use ParallelSSAGraphExecutor"; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - // TODO(Yancey1989): Remove passing in the main_program when - // allreduce_seq_pass doesn't need it as the attr. 
- bool is_inference = details::IsDataParallelInferenceGraph(*graph); - bool has_drop_last_read_op = details::HasDropLastReadOp(*graph); + auto &main_tensor = main_var->Get(); + if (!main_tensor.IsInitialized()) { + VLOG(3) << "one in var not inited, return!"; + continue; + } + auto &dims = main_tensor.dims(); + if (paddle::platform::is_gpu_place(main_tensor.place())) { +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) + std::vector buffers; + buffers.reserve(member_->places_.size()); + size_t numel = main_tensor.numel(); + ncclDataType_t data_type = platform::ToNCCLDataType(main_tensor.type()); + for (size_t i = 0; i < member_->places_.size(); ++i) { + auto place = member_->places_[i]; + void *buffer; - auto *pg_exe = new details::ParallelSSAGraphExecutor( - exec_strategy, member_->local_scopes_, member_->local_exec_scopes_, - member_->places_, graph); - final_graphs = pg_exe->Graphs(); - member_->executor_.reset(pg_exe); - - if (is_inference && member_->places_.size() > 1) { - member_->inference_executor_ = pg_exe; - if (!has_drop_last_read_op) { - VLOG(5) << "Enable partial feed support in inference phase"; - pg_exe->EnablePartialFeedSupport(); - } - } -#else - PADDLE_THROW(platform::errors::PreconditionNotMet( - "Paddle should be compiled with CUDA for ParallelGraph Execution.")); -#endif - } else { - bool has_drop_last_read_op = details::HasDropLastReadOp(*graph); - auto possible_inference_graphs = - details::TrySeparateToMultipleSingleDeviceGraphs(graph); - if (!possible_inference_graphs.empty()) { - VLOG(5) << "Use ParallelSSAGraphExecutor in inference phase"; - auto *pg_exe = new details::ParallelSSAGraphExecutor( - exec_strategy, member_->local_scopes_, member_->local_exec_scopes_, - member_->places_, std::move(possible_inference_graphs)); - if (!has_drop_last_read_op) { - VLOG(5) << "Enable partial feed support in inference phase"; - pg_exe->EnablePartialFeedSupport(); - } - final_graphs = pg_exe->Graphs(); - member_->executor_.reset(pg_exe); - member_->inference_executor_ = pg_exe; - } else { - LOG_IF(WARNING, details::HasKeepLastReadOp(*graph)) - << "drop_last=False for DataLoader is not supported in training " - "network. 
It is automatically turned to drop_last=True."; - if (exec_strategy.type_ == ExecutionStrategy::kDefault) { - VLOG(3) << "use ThreadedSSAGraphExecutor"; - member_->executor_.reset(new details::ThreadedSSAGraphExecutor( - exec_strategy, member_->local_scopes_, member_->local_exec_scopes_, - member_->places_, graph)); - } else { - if (member_->use_device_ == p::kXPU) { -#if defined(PADDLE_WITH_XPU) - VLOG(3) << "use BindThreadedSSAGraphExecutor"; - member_->executor_.reset(new details::BindThreadedSSAGraphExecutor( - exec_strategy, member_->local_scopes_, - member_->local_exec_scopes_, member_->places_, graph)); -#else - PADDLE_THROW(platform::errors::PermissionDenied( - "Paddle can't use XPU device since it's not compiled with XPU," - "Please recompile or reinstall Paddle with XPU support.")); -#endif - } else { - VLOG(3) << "use FastThreadedSSAGraphExecutor"; - member_->executor_.reset(new details::FastThreadedSSAGraphExecutor( - exec_strategy, member_->local_scopes_, - member_->local_exec_scopes_, member_->places_, graph)); - } - } - final_graphs.emplace_back(graph); - } - } - - VLOG(3) << "use ScopeBufferedSSAGraphExecutor"; - if (!member_->build_strategy_.async_mode_) { - member_->executor_.reset(new details::ScopeBufferedSSAGraphExecutor( - exec_strategy, member_->local_scopes_, member_->local_exec_scopes_, - std::move(var_infos), member_->places_, std::move(member_->executor_))); - } - - for (auto *g : final_graphs) { - auto ops = ir::FilterByNodeWrapper(*g); - for (auto *op : ops) { - op->SetLocalExecScopes(scope_map); - } - } - - if (final_graphs.size() == 1) { - ir::SetReaderOpDeviceInfo(final_graphs[0], member_->places_.size()); - } else { - for (size_t i = 0; i < final_graphs.size(); ++i) { - ir::SetReaderOpDeviceInfo(final_graphs[i], member_->places_.size(), i); - } - } -} - -void ParallelExecutor::BCastParamsToDevices( - const std::vector &vars, int trainer_id) const { - VLOG(3) << "BCastParamsToDevices"; - // the initializing bcast, all vars would be bcast from device(0). 
- for (auto &var : vars) { - framework::Variable *main_var = member_->local_scopes_[0]->FindVar(var); - if (main_var == nullptr || !main_var->IsType()) { - continue; - } - - auto &main_tensor = main_var->Get(); - if (!main_tensor.IsInitialized()) { - VLOG(3) << "one in var not inited, return!"; - continue; - } - auto &dims = main_tensor.dims(); - if (paddle::platform::is_gpu_place(main_tensor.place())) { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) - std::vector buffers; - buffers.reserve(member_->places_.size()); - size_t numel = main_tensor.numel(); - ncclDataType_t data_type = platform::ToNCCLDataType(main_tensor.type()); - for (size_t i = 0; i < member_->places_.size(); ++i) { - auto place = member_->places_[i]; - void *buffer; - - if (i == 0 && trainer_id == 0) { - buffer = const_cast(main_tensor.data()); - } else { - auto local_scope = member_->local_scopes_[i]; - auto *t = local_scope->Var(var)->GetMutable(); - t->Resize(dims); - buffer = t->mutable_data(place, main_tensor.type()); - } - buffers.push_back(buffer); - } + if (i == 0 && trainer_id == 0) { + buffer = const_cast(main_tensor.data()); + } else { + auto local_scope = member_->local_scopes_[i]; + auto *t = local_scope->Var(var)->GetMutable(); + t->Resize(dims); + buffer = t->mutable_data(place, main_tensor.type()); + } + buffers.push_back(buffer); + } PADDLE_ENFORCE_EQ(member_->places_.size(), buffers.size(), platform::errors::PreconditionNotMet( @@ -1367,6 +1058,399 @@ bool ParallelExecutor::EnableParallelGraphExecution( return enable_parallel_graph; } +void ParallelExecutor::InitExecutorPrivateMemberInfo( + const ExecutionStrategy &exec_strategy, const BuildStrategy &build_strategy, + size_t device_count, const ir::Graph &graph) { + member_->use_device_ = exec_strategy.use_device_; + member_->build_strategy_ = build_strategy; + member_->use_all_reduce_ = member_->build_strategy_.reduce_ == + BuildStrategy::ReduceStrategy::kAllReduce; + member_->nranks_ = build_strategy.num_trainers_ * device_count; + if (!member_->use_all_reduce_ && member_->nranks_ == 1) { + LOG(INFO) << "If you set build_strategy.reduce with 'Reduce'," + "the number of places should be greater than 1."; + member_->build_strategy_.reduce_ = + BuildStrategy::ReduceStrategy::kAllReduce; + member_->use_all_reduce_ = true; + } +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && defined(_WIN32) + if (member_->IsUseCUDA(member_->use_device_)) { + PADDLE_ENFORCE_EQ( + device_count, 1, + platform::errors::Unavailable("Windows can support Single GPU only.")); + } +#endif + +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && \ + (!defined(PADDLE_WITH_NCCL) && !defined(PADDLE_WITH_RCCL)) + if (member_->IsUseCUDA(member_->use_device_)) { + PADDLE_ENFORCE_EQ( + device_count, 1, + platform::errors::PermissionDenied( + "Your machine has multiple cards, " + "but the WITH_NCCL option is not turned on during compilation, " + "and you cannot use multi-card training or prediction. 
" + "Please recompile and turn on the WITH_NCCL option.")); + } +#endif + + std::string device_name; + if (member_->use_device_ == p::kCPU) { + device_name = "CPU"; + } else if (member_->use_device_ == p::kCUDA) { + device_name = "CUDA"; + } else { + device_name = "XPU"; + } + + VLOG(1) << string::Sprintf( + "The Program will be executed on %s using ParallelExecutor, %lu " + "cards are used, so %lu programs are executed in parallel.", + device_name, device_count, device_count); + + // FIXME(Yancey1989): parallel graph mode get better performance + // in GPU allreduce distributed training. Need an elegant way to + // choice the execution strategy. + member_->build_strategy_.enable_parallel_graph_ = + EnableParallelGraphExecution(graph, exec_strategy, + member_->build_strategy_); + if (member_->build_strategy_.enable_parallel_graph_) { + LOG(INFO) << "The Executor would execute the graph by ParallelGraph " + "Execution which can get better performance," + << "you can force it off by env FLAGS_enable_parallel_graph=0"; + } +} + +void ParallelExecutor::CreateLocalScopes( + Scope *global_scope, const std::vector &local_scopes, + bool create_new) { + if (local_scopes.empty()) { + member_->own_local_scope_ = true; + member_->local_scopes_.emplace_back(global_scope); + for (size_t i = 1; i < member_->places_.size(); ++i) { + member_->local_scopes_.emplace_back(&global_scope->NewScope()); + } + } else { + member_->own_local_scope_ = false; + PADDLE_ENFORCE_EQ(member_->places_.size(), local_scopes.size(), + platform::errors::PreconditionNotMet( + "member_->places_.size() = %d is not equal to " + "local_scopes.size() = %d", + member_->places_.size(), local_scopes.size())); + for (size_t i = 0; i < member_->places_.size(); ++i) { + if (create_new) { + member_->local_scopes_.emplace_back(&local_scopes[i]->NewScope()); + } else { + // Use local scopes directly + member_->local_scopes_.emplace_back(local_scopes[i]); + } + } + } +} + +std::unordered_map ParallelExecutor::CreateLocalExecScopes( + const std::vector &local_scopes, bool create_new) { + std::unordered_map scope_map; + + for (auto *scope : local_scopes) { + Scope *local_exec_scope = scope; + if (create_new) { + local_exec_scope = &scope->NewScope(); + } + member_->local_exec_scopes_.emplace_back(local_exec_scope); + scope_map.emplace(scope, local_exec_scope); + } + + PADDLE_ENFORCE_EQ( + member_->local_scopes_.size(), member_->local_exec_scopes_.size(), + platform::errors::PreconditionNotMet( + "member_->local_scopes_.size() = %d is not equal to " + "member_->local_exec_scopes_.size() = %d", + member_->local_scopes_.size(), member_->local_exec_scopes_.size())); + + return scope_map; +} + +std::vector ParallelExecutor::CloneGraphToMultiDevices( + ir::Graph *graph) { + std::vector graphs; + if (member_->build_strategy_.async_mode_) { + PADDLE_ENFORCE_EQ(member_->IsUseCUDA(member_->use_device_), false, + platform::errors::Unavailable( + "gpu mode does not support async_mode_ now!")); + graphs.push_back(graph); + for (size_t i = 1; i < member_->places_.size(); ++i) { + auto *tmp_graph = new ir::Graph(graph->OriginProgram()); + async_graphs_.emplace_back(tmp_graph); + graphs.push_back(tmp_graph); + } + } + + return graphs; +} + +void ParallelExecutor::PrepareNCCLCommunicator(Scope *global_scope) { + if (member_->IsUseCUDA(member_->use_device_) && member_->nranks_ > 1) { +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) + member_->InitOrGetNCCLCommunicator(global_scope, &member_->build_strategy_); + + // Initialize device context's nccl 
comm, will be used by normal + // Operators like sync_batch_norm, and collective ops. + // NOTE: more than one ParallelExecutor with same place, the nccl comm will + // be rewrite and there will be some problem. + // NOTE: NCCL group-calls and non-group-calls can not use the same + // NCCL communicator, so for ParallelGraph and Multi-Process mode, re-use + // same communicators. + auto *nccl_ctxs = member_->nccl_ctxs_->GetSyncBatchNormCtx( + global_scope, member_->places_); + auto &pool = platform::DeviceContextPool::Instance(); + for (size_t dev_id = 0; dev_id < member_->places_.size(); ++dev_id) { + auto *dev_ctx = static_cast( + pool.Get(member_->places_[dev_id])); + auto &nccl_ctx = nccl_ctxs->at(member_->places_[dev_id]); + dev_ctx->set_nccl_comm(nccl_ctx.comm()); + } +#else + PADDLE_THROW( + platform::errors::PreconditionNotMet("Not compiled with CUDA.")); +#endif + } + if (member_->use_device_ == p::kXPU && member_->nranks_ > 1) { +#if defined(PADDLE_WITH_XPU_BKCL) + member_->InitOrGetBKCLCommunicator(global_scope, member_->build_strategy_); + + auto *bkcl_ctxs = member_->bkcl_ctxs_->GetSyncBatchNormCtx( + global_scope, member_->places_); + auto &pool = platform::DeviceContextPool::Instance(); + for (size_t dev_id = 0; dev_id < member_->places_.size(); ++dev_id) { + auto *dev_ctx = static_cast( + pool.Get(member_->places_[dev_id])); + auto &bkcl_ctx = bkcl_ctxs->at(member_->places_[dev_id]); + dev_ctx->set_bkcl_context(bkcl_ctx.comm()); + } +#else + PADDLE_THROW( + platform::errors::PreconditionNotMet("Not compiled with XPU.")); +#endif + } +} + +std::vector ParallelExecutor::CompileGraphWithBuildStrategy( + ir::Graph *graph, std::vector *device_graphs, + const std::string &loss_var_name) { + auto device_count = member_->places_.size(); + std::vector async_graphs(device_count); + + auto &graphs = *device_graphs; +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) + if (member_->build_strategy_.async_mode_) { + PADDLE_ENFORCE_EQ(graphs.size(), device_count, + platform::errors::PreconditionNotMet( + "graphs.size() shoule be %d, but received %d", + device_count, graphs.size())); + VLOG(3) << "use local async mode"; + graph = member_->build_strategy_.Apply( + graph, {member_->places_[0]}, loss_var_name, + {member_->local_scopes_[0]}, 1, member_->use_device_, + member_->nccl_ctxs_); + for (size_t i = 1; i < device_count; ++i) { + graphs[i] = member_->build_strategy_.Apply( + graphs[i], {member_->places_[i]}, loss_var_name, + {member_->local_scopes_[i]}, 1, member_->use_device_, + member_->nccl_ctxs_); + async_graphs[i] = graphs[i]; + } + } else { + graph = member_->build_strategy_.Apply( + graph, member_->places_, loss_var_name, member_->local_scopes_, + member_->nranks_, member_->use_device_, member_->nccl_ctxs_); + } +#elif defined(PADDLE_WITH_XPU_BKCL) + if (member_->build_strategy_.async_mode_) { + PADDLE_ENFORCE_EQ(graphs.size(), device_count, + platform::errors::PreconditionNotMet( + "graphs.size() shoule be %d, but received %d", + device_count, graphs.size())); + VLOG(3) << "use local async mode"; + graph = member_->build_strategy_.Apply( + graph, {member_->places_[0]}, loss_var_name, + {member_->local_scopes_[0]}, 1, member_->use_device_, + member_->bkcl_ctxs_); + for (size_t i = 1; i < device_count; ++i) { + graphs[i] = member_->build_strategy_.Apply( + graphs[i], {member_->places_[i]}, loss_var_name, + {member_->local_scopes_[i]}, 1, member_->use_device_, + member_->bkcl_ctxs_); + async_graphs[i] = graphs[i]; + } + } else { + graph = member_->build_strategy_.Apply( + 
graph, member_->places_, loss_var_name, member_->local_scopes_, + member_->nranks_, member_->use_device_, member_->bkcl_ctxs_); + } +#else + if (member_->build_strategy_.async_mode_) { + VLOG(3) << "use local async mode"; + graph = member_->build_strategy_.Apply( + graph, {member_->places_[0]}, loss_var_name, + {member_->local_scopes_[0]}, 1, member_->use_device_); + for (size_t i = 1; i < device_count; ++i) { + graphs[i] = member_->build_strategy_.Apply( + graphs[i], {member_->places_[i]}, loss_var_name, + {member_->local_scopes_[i]}, 1, member_->use_device_); + async_graphs[i] = graphs[i]; + } + } else { + graph = member_->build_strategy_.Apply( + graph, member_->places_, loss_var_name, member_->local_scopes_, + member_->nranks_, member_->use_device_); + } +#endif + + return async_graphs; +} + +void ParallelExecutor::CreateVariableInfos( + std::vector<details::VariableInfo> *var_infos, ir::Graph *graph) { + PADDLE_ENFORCE_EQ( + var_infos->size(), 0, + platform::errors::PreconditionNotMet( + "var_infos->size() should be 0, but received %d", var_infos->size())); + PADDLE_ENFORCE_EQ( + member_->is_persistable_.size(), 0, + platform::errors::PreconditionNotMet( + "member_->is_persistable_.size() should be 0, but received %d", + member_->is_persistable_.size())); + for (auto &node : graph->Nodes()) { + if (node->IsVar() && !node->IsCtrlVar() && node->Var()) { + var_infos->emplace_back(); + var_infos->back().name_ = node->Var()->Name(); + var_infos->back().type_ = node->Var()->GetType(); + var_infos->back().persistable_ = node->Var()->Persistable(); + + member_->is_persistable_.emplace(node->Var()->Name(), + node->Var()->Persistable()); + } + } + + if (graph->Has(details::kFusedVars)) { + auto &fused_vars = graph->Get(details::kFusedVars); + for (auto &fused_var : fused_vars) { + var_infos->emplace_back(); + var_infos->back() = fused_var.second; + + member_->is_persistable_.emplace(fused_var.first, + fused_var.second.persistable_); + } + } +} + +std::vector<ir::Graph *> ParallelExecutor::CreateSSAGraphExecutor( + const ExecutionStrategy &exec_strategy, + std::vector<ir::Graph *> *async_graphs, ir::Graph *graph) { + std::vector<ir::Graph *> final_graphs; + + if (member_->build_strategy_.async_mode_) { + VLOG(3) << "use AsyncSSAGraphExecutor"; + member_->executor_.reset(new details::AsyncSSAGraphExecutor( + exec_strategy, member_->local_scopes_, member_->local_exec_scopes_, + member_->places_, *async_graphs)); + final_graphs = *async_graphs; + } else if (member_->build_strategy_.enable_parallel_graph_) { + VLOG(3) << "use ParallelSSAGraphExecutor"; +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + // TODO(Yancey1989): Remove passing in the main_program when + // allreduce_seq_pass doesn't need it as the attr.
+ bool is_inference = details::IsDataParallelInferenceGraph(*graph); + bool has_drop_last_read_op = details::HasDropLastReadOp(*graph); + + auto *pg_exe = new details::ParallelSSAGraphExecutor( + exec_strategy, member_->local_scopes_, member_->local_exec_scopes_, + member_->places_, graph); + final_graphs = pg_exe->Graphs(); + member_->executor_.reset(pg_exe); + + if (is_inference && member_->places_.size() > 1) { + member_->inference_executor_ = pg_exe; + if (!has_drop_last_read_op) { + VLOG(5) << "Enable partial feed support in inference phase"; + pg_exe->EnablePartialFeedSupport(); + } + } +#else + PADDLE_THROW(platform::errors::PreconditionNotMet( + "Paddle should be compiled with CUDA for ParallelGraph Execution.")); +#endif + } else { + bool has_drop_last_read_op = details::HasDropLastReadOp(*graph); + auto possible_inference_graphs = + details::TrySeparateToMultipleSingleDeviceGraphs(graph); + if (!possible_inference_graphs.empty()) { + VLOG(5) << "Use ParallelSSAGraphExecutor in inference phase"; + auto *pg_exe = new details::ParallelSSAGraphExecutor( + exec_strategy, member_->local_scopes_, member_->local_exec_scopes_, + member_->places_, std::move(possible_inference_graphs)); + if (!has_drop_last_read_op) { + VLOG(5) << "Enable partial feed support in inference phase"; + pg_exe->EnablePartialFeedSupport(); + } + final_graphs = pg_exe->Graphs(); + member_->executor_.reset(pg_exe); + member_->inference_executor_ = pg_exe; + } else { + LOG_IF(WARNING, details::HasKeepLastReadOp(*graph)) + << "drop_last=False for DataLoader is not supported in training " + "network. It is automatically turned to drop_last=True."; + if (exec_strategy.type_ == ExecutionStrategy::kDefault) { + VLOG(3) << "use ThreadedSSAGraphExecutor"; + member_->executor_.reset(new details::ThreadedSSAGraphExecutor( + exec_strategy, member_->local_scopes_, member_->local_exec_scopes_, + member_->places_, graph)); + } else { + VLOG(3) << "use FastThreadedSSAGraphExecutor"; + member_->executor_.reset(new details::FastThreadedSSAGraphExecutor( + exec_strategy, member_->local_scopes_, member_->local_exec_scopes_, + member_->places_, graph)); + } + final_graphs.emplace_back(graph); + } + } + return final_graphs; +} + +void ParallelExecutor::ResetOpHandleScopeMapOfGraphs( + const std::vector<ir::Graph *> &final_graphs, + const std::unordered_map<Scope *, Scope *> &scope_map) { + PADDLE_ENFORCE_GE( + final_graphs.size(), 1, + platform::errors::PreconditionNotMet( + "final_graphs should contain at least one graph, but received %d", + final_graphs.size())); + + PADDLE_ENFORCE_GT(scope_map.size(), 0, + platform::errors::PreconditionNotMet( + "scope_map should contain at least one " + "element, but received %d", + scope_map.size())); + for (auto *g : final_graphs) { + auto ops = ir::FilterByNodeWrapper<details::OpHandleBase>(*g); + for (auto *op : ops) { + op->SetLocalExecScopes(scope_map); + } + } +} + +void ParallelExecutor::SetReaderOpDeviceInfoOfGraphs( + const std::vector<ir::Graph *> &final_graphs) { + if (final_graphs.size() == 1) { + ir::SetReaderOpDeviceInfo(final_graphs[0], member_->places_.size()); + } else { + for (size_t i = 0; i < final_graphs.size(); ++i) { + ir::SetReaderOpDeviceInfo(final_graphs[i], member_->places_.size(), i); + } + } +} + const ir::Graph &ParallelExecutor::Graph() const { return member_->executor_->Graph(); } diff --git a/paddle/fluid/framework/parallel_executor.h b/paddle/fluid/framework/parallel_executor.h index 47de7dc48f4f2..d4d0b534b55f0 100644 --- a/paddle/fluid/framework/parallel_executor.h +++ b/paddle/fluid/framework/parallel_executor.h @@ -24,6 +24,7
@@ limitations under the License. */ #include "paddle/fluid/framework/details/build_strategy.h" #include "paddle/fluid/framework/details/execution_strategy.h" #include "paddle/fluid/framework/details/op_handle_base.h" +#include "paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h" #include "paddle/fluid/framework/executor.h" #include "paddle/fluid/framework/feed_fetch_type.h" #include "paddle/fluid/framework/op_info.h" @@ -41,6 +42,7 @@ namespace framework { class ParallelExecutorPrivate; +using details::VariableInfo; using details::BuildStrategy; using details::ExecutionStrategy; namespace p = paddle::platform; @@ -93,6 +95,40 @@ class ParallelExecutor { const ExecutionStrategy &exec_strategy, const BuildStrategy &build_strategy) const; + void InitExecutorPrivateMemberInfo(const ExecutionStrategy &exec_strategy, + const BuildStrategy &build_strategy, + size_t device_count, + const ir::Graph &graph); + + void CreateLocalScopes(Scope *global_scope, + const std::vector<Scope *> &local_scopes, + bool create_new); + + std::unordered_map<Scope *, Scope *> CreateLocalExecScopes( + const std::vector<Scope *> &local_scopes, bool create_new); + + std::vector<ir::Graph *> CloneGraphToMultiDevices(ir::Graph *graph); + + void PrepareNCCLCommunicator(Scope *global_scope); + + std::vector<ir::Graph *> CompileGraphWithBuildStrategy( + ir::Graph *graph, std::vector<ir::Graph *> *graphs, + const std::string &loss_var_name); + + void CreateVariableInfos(std::vector<details::VariableInfo> *var_infos, + ir::Graph *graph); + + std::vector<ir::Graph *> CreateSSAGraphExecutor( + const ExecutionStrategy &exec_strategy, + std::vector<ir::Graph *> *async_graphs, ir::Graph *graph); + + void ResetOpHandleScopeMapOfGraphs( + const std::vector<ir::Graph *> &final_graphs, + const std::unordered_map<Scope *, Scope *> &scope_map); + + void SetReaderOpDeviceInfoOfGraphs( + const std::vector<ir::Graph *> &final_graphs); + ParallelExecutorPrivate *member_; std::vector<std::unique_ptr<ir::Graph>> async_graphs_; }; diff --git a/paddle/fluid/framework/pipeline_trainer.cc b/paddle/fluid/framework/pipeline_trainer.cc index 5968df548dfb0..3649e00e7c9d8 100644 --- a/paddle/fluid/framework/pipeline_trainer.cc +++ b/paddle/fluid/framework/pipeline_trainer.cc @@ -12,7 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ + defined(WITH_ASCEND_CL) #include "paddle/fluid/framework/data_feed_factory.h" #include "paddle/fluid/framework/device_worker_factory.h" #include "paddle/fluid/framework/trainer.h" @@ -34,7 +35,11 @@ void PipelineTrainer::Initialize(const TrainerDesc& trainer_desc, ParseDumpConfig(trainer_desc); const auto& section_config = section_params.section_config(); int place_id = section_config.place_id(); +#if (defined PADDLE_WITH_NCCL) place_ = platform::CUDAPlace(place_id); +#elif (defined WITH_ASCEND_CL) + place_ = platform::NPUPlace(place_id); +#endif worker_ = DeviceWorkerFactory::CreateDeviceWorker( trainer_desc.device_worker_name()); auto this_worker = diff --git a/paddle/fluid/framework/ps_gpu_trainer.cc b/paddle/fluid/framework/ps_gpu_trainer.cc index e77932fa5f226..39bc3f040639b 100644 --- a/paddle/fluid/framework/ps_gpu_trainer.cc +++ b/paddle/fluid/framework/ps_gpu_trainer.cc @@ -19,10 +19,6 @@ limitations under the License.
*/ #include "paddle/fluid/framework/data_feed_factory.h" #include "paddle/fluid/framework/data_set.h" #include "paddle/fluid/framework/device_worker_factory.h" -#include "paddle/fluid/framework/fleet/fleet_wrapper.h" -#include "paddle/fluid/framework/fleet/heter_context.h" -#include "paddle/fluid/framework/fleet/heter_ps/feature_value.h" -#include "paddle/fluid/framework/fleet/ps_gpu_wrapper.h" #include "paddle/fluid/framework/trainer.h" #if (defined PADDLE_WITH_NCCL || defined PADDLE_WITH_RCCL) && \ (defined PADDLE_WITH_PSLIB) @@ -64,7 +60,6 @@ void PSGPUTrainer::Initialize(const TrainerDesc& trainer_desc, pull_dense_worker_ = PullDenseWorker::GetInstance(); pull_dense_worker_->Initialize(trainer_desc); SetDebug(trainer_desc.debug()); - fleet_ptr_ = FleetWrapper::GetInstance(); trainer_desc_ = trainer_desc; workers_.resize(place_num); for (int i = 0; i < place_num; ++i) { diff --git a/paddle/fluid/framework/ps_gpu_worker.cc b/paddle/fluid/framework/ps_gpu_worker.cc index 2597901d91f36..d178c4e556ca5 100644 --- a/paddle/fluid/framework/ps_gpu_worker.cc +++ b/paddle/fluid/framework/ps_gpu_worker.cc @@ -14,7 +14,6 @@ limitations under the License. */ #include "paddle/fluid/framework/device_worker.h" #include "paddle/fluid/framework/device_worker_factory.h" -#include "paddle/fluid/framework/fleet/fleet_wrapper.h" #include "paddle/fluid/framework/fleet/heter_wrapper.h" #include "paddle/fluid/platform/cpu_helper.h" #include "paddle/fluid/string/string_helper.h" diff --git a/paddle/fluid/framework/section_worker.cc b/paddle/fluid/framework/section_worker.cc index e740771e5ca9f..7860b69313e7b 100644 --- a/paddle/fluid/framework/section_worker.cc +++ b/paddle/fluid/framework/section_worker.cc @@ -9,7 +9,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ + defined(WITH_ASCEND_CL) #include #include "paddle/fluid/framework/device_worker.h" #include "paddle/fluid/framework/executor_gc_helper.h" diff --git a/paddle/fluid/framework/selected_rows.cc b/paddle/fluid/framework/selected_rows.cc index 4c30c40ad5837..7e48d0dc5f962 100644 --- a/paddle/fluid/framework/selected_rows.cc +++ b/paddle/fluid/framework/selected_rows.cc @@ -113,6 +113,21 @@ void SerializeToStream(std::ostream& os, const SelectedRows& selected_rows, TensorToStream(os, selected_rows.value(), dev_ctx); } +void SerializeToStream(std::ostream& os, const SelectedRows& selected_rows) { + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + const platform::DeviceContext* dev_ctx; + auto place = selected_rows.place(); + dev_ctx = pool.Get(place); + SerializeToStream(os, selected_rows, *dev_ctx); +} + +void DeserializeFromStream(std::ifstream& os, SelectedRows* selected_rows) { + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + const platform::DeviceContext* dev_ctx; + dev_ctx = pool.Get(platform::CPUPlace()); + DeserializeFromStream(os, selected_rows, *dev_ctx); +} + void DeserializeFromStream(std::istream& is, SelectedRows* selected_rows, const platform::DeviceContext& dev_ctx) { { diff --git a/paddle/fluid/framework/selected_rows.h b/paddle/fluid/framework/selected_rows.h index 48353b43f56ca..e53e3d973c524 100644 --- a/paddle/fluid/framework/selected_rows.h +++ b/paddle/fluid/framework/selected_rows.h @@ -173,5 +173,9 @@ void SerializeToStream(std::ostream& os, const SelectedRows& selected_rows, void DeserializeFromStream(std::istream& is, SelectedRows* selected_rows, const platform::DeviceContext& dev_ctx); +void SerializeToStream(std::ostream& os, const SelectedRows& selected_rows); + +void DeserializeFromStream(std::ifstream& os, SelectedRows* selected_rows); + } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc index d6882b25d2258..78fd1af09e294 100644 --- a/paddle/fluid/framework/tensor_util.cc +++ b/paddle/fluid/framework/tensor_util.cc @@ -822,6 +822,29 @@ void TensorToStream(std::ostream& os, const Tensor& tensor, #else PADDLE_THROW(platform::errors::Unimplemented( "XPUPlace is not supported when not compiled with XPU")); +#endif + } else if (platform::is_npu_place(tensor.place())) { +#ifdef PADDLE_WITH_ASCEND_CL + constexpr size_t kBufSize = 1024 * 1024 * 64; // 64MB + std::unique_ptr buf(new char[kBufSize]); + auto& npu_dev_ctx = + static_cast(dev_ctx); + platform::CPUPlace cpu; + uintptr_t data = reinterpret_cast(data_ptr); + while (size != 0) { + size_t size_to_write = std::min(kBufSize, static_cast(size)); + memory::Copy(cpu, buf.get(), + BOOST_GET_CONST(platform::NPUPlace, tensor.place()), + reinterpret_cast(data), size_to_write, + npu_dev_ctx.stream()); + npu_dev_ctx.Wait(); + os.write(buf.get(), size_to_write); + data += size_to_write; + size -= size_to_write; + } +#else + PADDLE_THROW(platform::errors::Unimplemented( + "NPUPlace is not supported when not compiled with NPU")); #endif } else { os.write(static_cast(data_ptr), @@ -877,9 +900,10 @@ void TensorFromStream(std::istream& is, Tensor* tensor, auto ctx = platform::CPUDeviceContext(); size_t size = tensor->numel() * framework::SizeOfType(desc.data_type()); if (platform::is_gpu_place(dev_ctx.GetPlace()) || - 
platform::is_xpu_place(dev_ctx.GetPlace())) { + platform::is_xpu_place(dev_ctx.GetPlace()) || + platform::is_npu_place(dev_ctx.GetPlace())) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ - defined(PADDLE_WITH_XPU) + defined(PADDLE_WITH_XPU) || defined(PADDLE_WITH_ASCEND_CL) Tensor cpu_tensor; cpu_tensor.Resize(framework::make_ddim(shape)); framework::VisitDataType( @@ -888,13 +912,19 @@ void TensorFromStream(std::istream& is, Tensor* tensor, is.read(static_cast(buf), size); auto dst_place = dev_ctx.GetPlace(); framework::TensorCopy(cpu_tensor, dst_place, dev_ctx, tensor); + if (platform::is_npu_place(dev_ctx.GetPlace())) { + dev_ctx.Wait(); + } #else if (platform::is_gpu_place(dev_ctx.GetPlace())) { PADDLE_THROW(platform::errors::Unimplemented( "CUDAPlace is not supported when not compiled with CUDA")); - } else { + } else if (platform::is_xpu_place(dev_ctx.GetPlace())) { PADDLE_THROW(platform::errors::Unimplemented( "XPUPlace is not supported when not compiled with XPU")); + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "NPUPlace is not supported when not compiled with NPU")); } #endif } else { @@ -935,9 +965,10 @@ void TensorFromStream(std::istream& is, Tensor* tensor, auto ctx = platform::CPUDeviceContext(); size_t size = tensor->numel() * framework::SizeOfType(desc.data_type()); if (platform::is_gpu_place(dev_ctx.GetPlace()) || - platform::is_xpu_place(dev_ctx.GetPlace())) { + platform::is_xpu_place(dev_ctx.GetPlace()) || + platform::is_npu_place(dev_ctx.GetPlace())) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ - defined(PADDLE_WITH_XPU) + defined(PADDLE_WITH_XPU) || defined(PADDLE_WITH_ASCEND_CL) Tensor cpu_tensor; cpu_tensor.Resize(framework::make_ddim(dims)); framework::VisitDataType( @@ -946,13 +977,19 @@ void TensorFromStream(std::istream& is, Tensor* tensor, is.read(static_cast(buf), size); auto dst_place = dev_ctx.GetPlace(); framework::TensorCopy(cpu_tensor, dst_place, dev_ctx, tensor); + if (platform::is_npu_place(dev_ctx.GetPlace())) { + dev_ctx.Wait(); + } #else if (platform::is_gpu_place(dev_ctx.GetPlace())) { PADDLE_THROW(platform::errors::Unimplemented( "CUDAPlace is not supported when not compiled with CUDA")); - } else { + } else if (platform::is_xpu_place(dev_ctx.GetPlace())) { PADDLE_THROW(platform::errors::Unimplemented( "XPUPlace is not supported when not compiled with XPU")); + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "NPUPlace is not supported when not compiled with NPU")); } #endif } else { diff --git a/paddle/fluid/framework/tensor_util.h b/paddle/fluid/framework/tensor_util.h index 868d920f13ca8..22c8e1c1665f1 100644 --- a/paddle/fluid/framework/tensor_util.h +++ b/paddle/fluid/framework/tensor_util.h @@ -135,6 +135,7 @@ void TensorFromArray(const T* src, const size_t& array_size, } #endif } + template void TensorFromVector(const std::vector& src, const platform::DeviceContext& ctx, Tensor* dst) { @@ -158,13 +159,58 @@ void TensorFromVector(const std::vector& src, } #endif #ifdef PADDLE_WITH_ASCEND_CL + // NOTE(zhiqiu): Becareful that aclrtMemcpyAsync is different from + // cudaMemcpyAsync. + // cudaMemcpyAsync is actually "sync" between cpu <-> gpu. + // aclrtMemcpyAsync is really "async" between cpu <-> npu. + // Since vector is on cpu, I think this function should be a "sync" operation, + // so pass nullptr as stream to memory::Copy(). 
else if (platform::is_npu_place(dst_place)) { // NOLINT + memory::Copy(BOOST_GET_CONST(platform::NPUPlace, dst_place), dst_ptr, + src_place, src_ptr, size, nullptr); + } +#endif +} + +// The fully specialized function should be inline to avoid +// multi-definition. +template <> +inline void TensorFromVector(const std::vector& src, + const platform::DeviceContext& ctx, Tensor* dst) { + // vector has no data() member, use array instead. + // See details: + // https://stackoverflow.com/questions/46115669/why-does-stdvectorbool-have-no-data/46115714 + bool* array = new bool[src.size()]; + for (unsigned int i = 0; i < src.size(); i++) { + array[i] = static_cast(src[i]); + } + + auto dst_place = ctx.GetPlace(); + auto src_ptr = static_cast(array); + platform::CPUPlace src_place; + dst->Resize({static_cast(src.size())}); + auto dst_ptr = static_cast(dst->mutable_data(dst_place)); + auto size = src.size() * sizeof(bool); + + if (platform::is_cpu_place(dst_place)) { + memory::Copy(BOOST_GET_CONST(platform::CPUPlace, dst_place), dst_ptr, + src_place, src_ptr, size); + } +#ifdef PADDLE_WITH_CUDA + else if (platform::is_gpu_place(dst_place)) { // NOLINT memory::Copy( - BOOST_GET_CONST(platform::NPUPlace, dst_place), dst_ptr, src_place, + BOOST_GET_CONST(platform::CUDAPlace, dst_place), dst_ptr, src_place, src_ptr, size, - reinterpret_cast(ctx).stream()); + reinterpret_cast(ctx).stream()); } #endif +#ifdef PADDLE_WITH_ASCEND_CL + else if (platform::is_npu_place(dst_place)) { // NOLINT + memory::Copy(BOOST_GET_CONST(platform::NPUPlace, dst_place), dst_ptr, + src_place, src_ptr, size, nullptr); + } +#endif + delete[] array; } template @@ -179,6 +225,23 @@ void TensorFromVector(const std::vector& src, Tensor* dst) { memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); } +template <> +inline void TensorFromVector(const std::vector& src, Tensor* dst) { + bool* array = new bool[src.size()]; + for (unsigned int i = 0; i < src.size(); i++) { + array[i] = static_cast(src[i]); + } + platform::CPUPlace dst_place = platform::CPUPlace(); + auto src_ptr = static_cast(array); + platform::CPUPlace src_place; + dst->Resize({static_cast(src.size())}); + auto dst_ptr = static_cast(dst->mutable_data(dst_place)); + auto size = src.size() * sizeof(bool); + + memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); + delete[] array; +} + template void TensorToVector(const Tensor& src, const platform::DeviceContext& ctx, std::vector* dst) { @@ -204,12 +267,50 @@ void TensorToVector(const Tensor& src, const platform::DeviceContext& ctx, #endif #ifdef PADDLE_WITH_ASCEND_CL else if (platform::is_npu_place(src.place())) { // NOLINT + memory::Copy(dst_place, dst_ptr, + BOOST_GET_CONST(platform::NPUPlace, src.place()), src_ptr, + size, nullptr); + } +#endif +} + +template <> +inline void TensorToVector(const Tensor& src, + const platform::DeviceContext& ctx, + std::vector* dst) { + auto src_ptr = static_cast(src.data()); + auto size = src.numel() * sizeof(bool); + + bool* array = new bool[src.numel()]; + + platform::CPUPlace dst_place; + dst->resize(src.numel()); + auto dst_ptr = static_cast(array); + + if (platform::is_cpu_place(src.place())) { + memory::Copy(dst_place, dst_ptr, + BOOST_GET_CONST(platform::CPUPlace, src.place()), src_ptr, + size); + } +#ifdef PADDLE_WITH_CUDA + else if (platform::is_gpu_place(src.place())) { // NOLINT memory::Copy( - dst_place, dst_ptr, BOOST_GET_CONST(platform::NPUPlace, src.place()), + dst_place, dst_ptr, BOOST_GET_CONST(platform::CUDAPlace, src.place()), src_ptr, size, - 
reinterpret_cast(ctx).stream()); + reinterpret_cast(ctx).stream()); } #endif +#ifdef PADDLE_WITH_ASCEND_CL + else if (platform::is_npu_place(src.place())) { // NOLINT + memory::Copy(dst_place, dst_ptr, + BOOST_GET_CONST(platform::NPUPlace, src.place()), src_ptr, + size, nullptr); + } +#endif + for (unsigned int i = 0; i < src.numel(); i++) { + (*dst)[i] = static_cast(array[i]); + } + delete[] array; } template @@ -231,6 +332,32 @@ void TensorToVector(const Tensor& src, std::vector* dst) { BOOST_GET_CONST(platform::CPUPlace, src.place()), src_ptr, size); } +template <> +inline void TensorToVector(const Tensor& src, std::vector* dst) { + auto src_ptr = static_cast(src.data()); + auto size = src.numel() * sizeof(bool); + + bool* array = new bool[src.numel()]; + + platform::CPUPlace dst_place; + dst->resize(src.numel()); + auto dst_ptr = static_cast(array); + + PADDLE_ENFORCE_EQ( + platform::is_cpu_place(src.place()), true, + platform::errors::InvalidArgument( + "The input tensor should be CPU device, but actually it is in %s.", + src.place())); + + memory::Copy(dst_place, dst_ptr, + BOOST_GET_CONST(platform::CPUPlace, src.place()), src_ptr, size); + + for (unsigned int i = 0; i < src.numel(); i++) { + (*dst)[i] = static_cast(array[i]); + } + delete[] array; +} + std::ostream& operator<<(std::ostream& os, const Tensor& t); } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/tensor_util_test.cc b/paddle/fluid/framework/tensor_util_test.cc index c32efd0a470be..8587ee8d1e919 100644 --- a/paddle/fluid/framework/tensor_util_test.cc +++ b/paddle/fluid/framework/tensor_util_test.cc @@ -242,6 +242,61 @@ TEST(TensorToVector, Tensor) { #endif } +TEST(TensorToVector, Tensor_bool) { + { + paddle::framework::Tensor src; + bool* src_ptr = + src.mutable_data({3, 3}, paddle::platform::CPUPlace()); + for (int i = 0; i < 3 * 3; ++i) { + src_ptr[i] = static_cast(i % 2); + } + + paddle::platform::CPUPlace place; + std::vector dst; + paddle::framework::TensorToVector(src, &dst); + + for (int i = 0; i < 3 * 3; ++i) { + EXPECT_EQ(src_ptr[i], dst[i]); + } + } +#ifdef PADDLE_WITH_CUDA + { + std::vector src_vec = { + false, true, false, true, false, true, false, true, false, + }; + paddle::framework::Tensor gpu_tensor; + paddle::platform::CUDAPlace place; + paddle::platform::CUDADeviceContext gpu_ctx(place); + paddle::framework::TensorFromVector(src_vec, gpu_ctx, &gpu_tensor); + + std::vector dst; + paddle::framework::TensorToVector(gpu_tensor, gpu_ctx, &dst); + + for (int i = 0; i < 3 * 3; ++i) { + EXPECT_EQ(src_vec[i], dst[i]); + } + } +#endif +#ifdef PADDLE_WITH_ASCEND_CL + { + std::vector src_vec = { + false, true, false, true, false, true, false, true, false, + }; + paddle::framework::Tensor npu_tensor; + paddle::platform::NPUPlace place(0); + paddle::platform::NPUDeviceContext npu_ctx(place); + paddle::framework::TensorFromVector(src_vec, npu_ctx, &npu_tensor); + + std::vector dst; + paddle::framework::TensorToVector(npu_tensor, npu_ctx, &dst); + + for (int i = 0; i < 3 * 3; ++i) { + EXPECT_EQ(src_vec[i], dst[i]); + } + } +#endif +} + TEST(TensorFromDLPack, Tensor) { { std::vector src_vec = {1, 2, 3, 4, 5, 6, 7, 8, 9}; diff --git a/paddle/fluid/framework/trainer.h b/paddle/fluid/framework/trainer.h index ca290a50b42fe..01aa07e618464 100644 --- a/paddle/fluid/framework/trainer.h +++ b/paddle/fluid/framework/trainer.h @@ -109,13 +109,22 @@ class MultiTrainer : public TrainerBase { virtual Scope* GetWorkerScope(int thread_id); virtual std::string GetDumpPath(int tid); + template + void 
MergeToRootScope(LoDTensor* root_tensor, LoDTensor* thread_tensor); +#ifdef PADDLE_WITH_HETERPS + + void MergeDenseParam(); +#endif + protected: int thread_num_; std::vector threads_; std::vector readers_; std::vector> workers_; std::vector need_merge_var_names_; - +#ifdef PADDLE_WITH_HETERPS + std::vector places_; +#endif int mpi_rank_; int mpi_size_; int dump_file_num_; @@ -313,7 +322,6 @@ class PSGPUTrainer : public TrainerBase { float scale_datanorm_; paddle::platform::Place place_; ProgramDesc program_; - std::shared_ptr fleet_ptr_; std::shared_ptr pull_dense_worker_; std::vector> workers_; std::vector places_; @@ -324,7 +332,8 @@ class PSGPUTrainer : public TrainerBase { }; #endif -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ + defined(WITH_ASCEND_CL) class PipelineTrainer : public TrainerBase { public: PipelineTrainer() {} diff --git a/paddle/fluid/framework/type_defs.h b/paddle/fluid/framework/type_defs.h index a2b5a98401e23..e43cccfe64816 100644 --- a/paddle/fluid/framework/type_defs.h +++ b/paddle/fluid/framework/type_defs.h @@ -45,6 +45,17 @@ using Attribute = boost::variant< using AttributeMap = std::unordered_map; +#ifdef PADDLE_WITH_ASCEND_CL +using NPUAttribute = + boost::variant, + std::vector, std::vector, bool, + std::vector, BlockDesc*, int64_t, + std::vector, std::vector, + std::vector, std::vector>>; + +using NPUAttributeMap = std::unordered_map; +#endif + using OpCreator = std::function; diff --git a/paddle/fluid/framework/var_type_traits.h b/paddle/fluid/framework/var_type_traits.h index fc754cbaf177c..473df85aa0421 100644 --- a/paddle/fluid/framework/var_type_traits.h +++ b/paddle/fluid/framework/var_type_traits.h @@ -36,6 +36,11 @@ #endif #endif +#ifdef PADDLE_WITH_ASCEND_CL +#include +#include +#endif + #if defined(PADDLE_WITH_XPU_BKCL) #include "xpu/bkcl.h" #endif @@ -50,6 +55,10 @@ class Communicator; class NCCLCommunicator; #endif #endif +#ifdef PADDLE_WITH_ASCEND_CL +class Communicator; +class HCCLCommunicator; +#endif #if defined(PADDLE_WITH_XPU_BKCL) class BKCLCommunicator; @@ -162,6 +171,9 @@ using VarTypeRegistry = detail::VarTypeRegistryImpl< #endif operators::CudnnRNNCache, #endif +#if defined(PADDLE_WITH_ASCEND_CL) + HcclRootInfo, +#endif #if defined(PADDLE_WITH_XPU_BKCL) BKCLUniqueId, platform::BKCLCommunicator, #endif diff --git a/paddle/fluid/imperative/CMakeLists.txt b/paddle/fluid/imperative/CMakeLists.txt index a24c0ac09c758..6bee3d44b2edd 100644 --- a/paddle/fluid/imperative/CMakeLists.txt +++ b/paddle/fluid/imperative/CMakeLists.txt @@ -4,7 +4,7 @@ cc_library(prepared_operator SRCS prepared_operator.cc DEPS proto_desc operator cc_library(layer SRCS layer.cc DEPS prepared_operator math_function imperative_flag variable_helper op_registry) add_subdirectory(jit) cc_library(amp SRCS amp_auto_cast.cc DEPS layer ) -cc_library(tracer SRCS tracer.cc DEPS layer engine program_desc_tracer amp) +cc_library(tracer SRCS tracer.cc DEPS layer engine program_desc_tracer amp denormal) cc_library(basic_engine SRCS basic_engine.cc DEPS layer gradient_accumulator) cc_library(engine SRCS basic_engine.cc partial_grad_engine.cc DEPS layer gradient_accumulator) cc_library(imperative_profiler SRCS profiler.cc) diff --git a/paddle/fluid/imperative/basic_engine.cc b/paddle/fluid/imperative/basic_engine.cc index 2a439a6f1ea81..d5350744e4c55 100644 --- a/paddle/fluid/imperative/basic_engine.cc +++ b/paddle/fluid/imperative/basic_engine.cc @@ -284,15 +284,15 @@ static std::shared_ptr> 
CallGradientHooks( for (const auto& pair : bwd_ins) { for (size_t i = 0; i < pair.second.size(); ++i) { auto& var = pair.second[i]; - if (var->HasHook()) { + if (var->HasVariableWrapperHook()) { if (tmp_ins_ptr == nullptr) { tmp_ins_ptr = std::make_shared>(bwd_ins); } - VLOG(3) << "Call " << var->GetHooks().size() << " hooks of " << op_type - << "'s input `" << pair.first << "`'s var `" << var->Name() - << "`."; + VLOG(3) << "Call " << var->GetVariableWrapperHooks().size() + << " hooks of " << op_type << "'s input `" << pair.first + << "`'s var `" << var->Name() << "`."; auto tmp_var = var; - for (const auto& hook_pair : var->GetHooks()) { + for (const auto& hook_pair : var->GetVariableWrapperHooks()) { tmp_var = (*hook_pair.second)(tmp_var); } (*tmp_ins_ptr)[pair.first][i] = tmp_var; diff --git a/paddle/fluid/imperative/dygraph_grad_maker.h b/paddle/fluid/imperative/dygraph_grad_maker.h index a367840472827..7fefc9ccc67b5 100644 --- a/paddle/fluid/imperative/dygraph_grad_maker.h +++ b/paddle/fluid/imperative/dygraph_grad_maker.h @@ -279,6 +279,8 @@ class TracedGradOp { void SetType(const std::string& type) { op_->SetType(type); } + const framework::OperatorBase& InnerOp() const { return op_->InnerOp(); } + void SetAttrMap(const framework::AttributeMap& attrs) { return op_->SetAttrMap(attrs); } diff --git a/paddle/fluid/imperative/gradient_accumulator.cc b/paddle/fluid/imperative/gradient_accumulator.cc index 64f5a9e0cc877..43546cf99c69f 100644 --- a/paddle/fluid/imperative/gradient_accumulator.cc +++ b/paddle/fluid/imperative/gradient_accumulator.cc @@ -467,14 +467,14 @@ void GradientAccumulator::CallGradientHooks() { platform::errors::PreconditionNotMet("Leaf Tensor's inner var " "is not initialized when " "call gradient hook.")); - if (var_->HasHook()) { - VLOG(3) << "Call " << var_->GetHooks().size() + if (var_->HasVariableWrapperHook()) { + VLOG(3) << "Call " << var_->GetVariableWrapperHooks().size() << " hooks of leaf gradient accumulator's inner var `" << var_->Name() << "`."; auto tmp_var = inner_var_; VLOG(3) << "Input var " << var_->Name() << "'s hook size - " - << var_->GetHooks().size(); - for (const auto& hook_pair : var_->GetHooks()) { + << var_->GetVariableWrapperHooks().size(); + for (const auto& hook_pair : var_->GetVariableWrapperHooks()) { tmp_var = (*hook_pair.second)(tmp_var); } inner_var_ = tmp_var; @@ -495,10 +495,10 @@ void GradientAccumulator::CallReduceHooks() { "Only can call reduce hooks after the " "gradient accumulation is completed in " "current batch or across batchs.")); - if (var_->HasMutableHook()) { - for (const auto& hook : var_->GetMutableHooks()) { + if (var_->HasVoidHook()) { + for (const auto& hook : var_->GetVoidHooks()) { VLOG(3) << "call gradient accumulator backward hooks."; - (*hook)(var_); + (*hook)(); } } } diff --git a/paddle/fluid/imperative/hooks.h b/paddle/fluid/imperative/hooks.h index 4d59298aed51f..fa929b7c7a51c 100644 --- a/paddle/fluid/imperative/hooks.h +++ b/paddle/fluid/imperative/hooks.h @@ -23,32 +23,34 @@ namespace imperative { class VariableWrapper; -/** [ Const VariableWrapper Hook: Pre hook functor of OpBase ] +/** [ VariableWrapper Hook ] * - * @brief This hook functor is executed before the grad OpBase is executed, - * taking the input of the current grad OpBase as input, and - * executing python hooks (user-defined) or C++ hooks (developer-defined) - * to achieve the purpose of custom operations on the interior VarBase - * gradient. 
+ * @brief This hook functor is executed before the grad OpBase is executed or + * after gradient accumulation completed in current batch. + * 1. For interior var, VariableWrapper Hook take the input of the + * current grad OpBase as input. + * 2. For leaf var, VariableWrapper Hook take the inner_var_ of + * GradientAccumulator as input. * - * @note This hook functor will not change the input gradient VarBase. + * @note This hook functor will not change the input gradient VariableWrapper, + * but if you copy the input VariableWrapper and change the value of + * Variable in VariableWrapper, the value of input will also be changed, + * because they shared same PlaceHolder. * - * @note [Why need to be OpBase `PreHook`, why not `PostHook`?] + * @note [ Why need to be OpBase `PreHook`, why not `PostHook`? ] * - * 1. We expect If set OpBase post hook, when the op executed end, the + * We expect If set OpBase post hook, when the op executed end, the * op's output gradient may not be the final state, because it may need * other op's gradient output to accumulated to it. But before op can * be executed, the gradient output must have been accumulated to final * value. - * 2. We don’t want the hook to change its input Tensor value, so now - * we can't call all hooks in GradAccumulator. * - * @note [Why only can be used for interior VarBase?] + * @note [ Why Leaf gradient is special? ] * * Because the leaf VarBase's GradVarBase has no GradOpNode, so leaf * GradVarBase has no next OpBase to executed, so if need to deal with - * the leaf GradVarBase, cannot use this hook functor. For this case, we - * deal with by other inplace hook method. + * the leaf GradVarBase, we should call hooks after gradient accumulation + * completed. */ class VariableWrapperHook { public: @@ -57,34 +59,22 @@ class VariableWrapperHook { const std::shared_ptr& var) = 0; }; -/** [ Inplace VariableWrapper Hook: Post hook functor of GradAccumulator ] - * - * @brief This hook functor is the Hook that operates on the current - * gradientafter the GradientAccumulator has accumulated the gradient. - * Leaf GradVarBase has no next OpBase, if we want to register hook - * for it, we also need to wait until the leaf GradVarBase accumulation - * is completed, so we can add post hook to GradientAccumulator. - * - * @note This hook functor will change the grad VarBase value. - * - * @note Only allow leaf VarBase hold call this hook functor. 
- */ -class InplaceVariableWrapperHook { - public: - virtual ~InplaceVariableWrapperHook() = default; - virtual void operator()(VariableWrapper* var) = 0; -}; - -class LambdaInplaceVariableWrapperHook : public InplaceVariableWrapperHook { +class CppVariableWrapperHook : public VariableWrapperHook { public: - explicit LambdaInplaceVariableWrapperHook( - std::function&& fn) + explicit CppVariableWrapperHook( + std::function( + const std::shared_ptr&)>&& fn) : fn_(std::move(fn)) {} - void operator()(VariableWrapper* var) override { fn_(var); } + std::shared_ptr operator()( + const std::shared_ptr& var) override { + return fn_(var); + } private: - std::function fn_; + std::function( + const std::shared_ptr&)> + fn_; }; } // namespace imperative diff --git a/paddle/fluid/imperative/layer.cc b/paddle/fluid/imperative/layer.cc index 062f04c6b7052..a4af3117d3e32 100644 --- a/paddle/fluid/imperative/layer.cc +++ b/paddle/fluid/imperative/layer.cc @@ -187,6 +187,7 @@ size_t VarBase::GradOpNum() const { } void VarBase::ClearGradient() { + VLOG(4) << "ClearGradient " << Name(); if (grad_var_) { if (grad_var_->Var().IsType()) { auto* grad_t = @@ -406,7 +407,7 @@ void OpBase::Run(const framework::OperatorBase& op, OpBaseRunImpl(op, ins, outs, attrs, place); } -static void ClearNoNeedBufferInputs(OpBase* op) { +void ClearNoNeedBufferInputs(OpBase* op) { auto& inferer = op->Info().NoNeedBufferVarsInferer(); if (!inferer) return; auto* ins = op->GetMutableInsMap(); diff --git a/paddle/fluid/imperative/layer.h b/paddle/fluid/imperative/layer.h index f87db415768a1..bbede47e36429 100644 --- a/paddle/fluid/imperative/layer.h +++ b/paddle/fluid/imperative/layer.h @@ -108,6 +108,10 @@ class VarBase { void ClearGradVarBase() { grad_var_ = nullptr; } + void SetGradVarBase(VarBase& grad_var) { + MutableGradVarBase()->CopyFrom(grad_var, true); + } + const std::shared_ptr& MutableGradVarBase() { if (grad_var_ == nullptr) { if (auto grad_var_wrapper = var_->GetGradVar()) { @@ -222,23 +226,25 @@ class VarBase { void BumpInplaceVersion(); /* Hook related method: now only used for GradVarBase */ - bool HasHook() const { return var_->HasHook(); } + bool HasVariableWrapperHook() const { return var_->HasVariableWrapperHook(); } - int64_t AddHook(std::shared_ptr&& hook) { - return var_->AddHook( + int64_t AddVariableWrapperHook(std::shared_ptr&& hook) { + return var_->AddVariableWrapperHook( std::forward>(hook)); } - bool RemoveHook(const int64_t& hook_id) { return var_->RemoveHook(hook_id); } + bool RemoveVariableWrapperHook(const int64_t& hook_id) { + return var_->RemoveVariableWrapperHook(hook_id); + } - const std::map>& GetHooks() - const { - return var_->GetHooks(); + const std::map>& + GetVariableWrapperHooks() const { + return var_->GetVariableWrapperHooks(); } - void AddMutableHook(std::shared_ptr&& hook) { - var_->AddMutableHook( - std::forward>(hook)); + void AddVoidHook(std::shared_ptr>&& hook) { + var_->AddVoidHook( + std::forward>>(hook)); } private: @@ -280,5 +286,7 @@ std::shared_ptr CreateGradOpNode( const platform::Place& place, const std::map& inplace_map); +void ClearNoNeedBufferInputs(OpBase* op); + } // namespace imperative } // namespace paddle diff --git a/paddle/fluid/imperative/py_layer_fwd.h b/paddle/fluid/imperative/py_layer_fwd.h new file mode 100644 index 0000000000000..bd132f2576fec --- /dev/null +++ b/paddle/fluid/imperative/py_layer_fwd.h @@ -0,0 +1,172 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include "paddle/fluid/imperative/layer.h" +#include "paddle/fluid/imperative/tracer.h" + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/type_defs.h" +#include "paddle/fluid/operators/py_layer_op.h" + +namespace paddle { +namespace imperative { + +namespace py = ::pybind11; + +bool RequiredGrad(const NameVarBaseMap& ins, const NameVarBaseMap& outs) { + for (const auto& name_pair : ins) { + for (const auto& var_base : name_pair.second) { + if (!var_base->OverridedStopGradient()) { + PassStopGradient(outs, var_base->OverridedStopGradient()); + return true; + } + } + } + return false; +} + +std::shared_ptr CreateGradOpNode( + const std::string& type, const NameVarBaseMap& ins, + const NameVarBaseMap& outs, const framework::AttributeMap& attrs, + const platform::Place& place, + const std::map& inplace_map, + const std::shared_ptr& py_context) { + operators::PyLayerGradOpMaker maker( + type, ins, outs, attrs, inplace_map); + + maker.SetPyLayerContext(py_context); + auto grad_node = maker(); + if (grad_node && !grad_node->empty()) { + for (auto& grad_op : *grad_node) { + grad_op.SetId(OpBase::GenerateUniqueId()); + grad_op.SetPlace(place); + ClearNoNeedBufferInputs(&grad_op); + } + return grad_node; + } else { + return nullptr; + } +} + +py::object PyLayerApply(const platform::Place& place, const py::object& cls, + const py::args args, const py::kwargs kwargs) { + auto bk_function = cls.attr("_backward_function"); + auto context = bk_function(); + auto forward = cls.attr("forward"); + + auto result_forward = forward(context, *args, **kwargs); + std::shared_ptr py_layer_ctx = + std::make_shared(context.release().ptr()); + // make inputs to varbase + std::vector> input_vars; + // process args,`input_vars` only collect `imperative::VarBase` + if (!args.empty()) { + for (auto ptr = args.begin(); ptr != args.end(); ptr++) { + try { + if (Py_None != ptr->ptr()) { + auto a = ptr->cast>(); + input_vars.push_back(a); + } + } catch (py::cast_error& err) { + // Only collect Tensor type in 'args' and pass them to backward. Ignore + // other types of input temporarily. + } + } + } + // process kwargs, only collect `imperative::VarBase` + if (!kwargs.empty()) { + for (auto ptr = kwargs.begin(); ptr != kwargs.end(); ptr++) { + try { + if (Py_None != ptr->second.ptr()) { + auto a = ptr->second.cast>(); + input_vars.push_back(a); + } + } catch (py::cast_error&) { + // Only collect Tensor type in 'kwargs' and pass them to backward. + // Ignore other types of input temporarily. 
+ } + } + } + NameVarBaseMap ins = {{"X", input_vars}}; + + std::vector> output_vars; + if (PyTuple_Check(result_forward.ptr()) || + PyList_Check(result_forward.ptr())) { + auto tuple_result = result_forward.cast(); + for (size_t i = 0; i < tuple_result.size(); i++) { + if (Py_None != tuple_result[i].ptr()) { + try { + auto temp_out = + tuple_result[i].cast>(); + output_vars.push_back(temp_out); + } catch (py::cast_error&) { + PADDLE_THROW(platform::errors::Unimplemented( + "The output of `PyLayer.forward` should be `Tensor`.")); + } + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "The output of `PyLayer.forward` can not be `None`.")); + } + } + } else { + if (Py_None != result_forward.ptr()) { + try { + auto temp_out = + result_forward.cast>(); + output_vars.push_back(temp_out); + } catch (py::cast_error&) { + PADDLE_THROW(platform::errors::Unimplemented( + "The output of `PyLayer.forward` should be `Tensor`.")); + } + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "The output of `PyLayer.forward` can not be `None`.")); + } + } + + NameVarBaseMap outs = {{"Out", output_vars}}; + + if (RequiredGrad(ins, outs)) { + std::map inplace_map{}; + bool if_inplace = false; + for (auto temp_ins : input_vars) { + if (if_inplace) { + break; + } + for (auto temp_outs : output_vars) { + if (temp_ins->Name() == temp_outs->Name()) { + if_inplace = true; + break; + } + } + } + if (if_inplace) { + inplace_map["X"] = "Out"; + } + + CreateGradOpNode("py_layer", ins, outs, {{}}, place, inplace_map, + py_layer_ctx); + } else { + VLOG(3) << "No Grad to track for Op: py_layer_op"; + } + + return result_forward; +} + +} // namespace imperative +} // namespace paddle diff --git a/paddle/fluid/imperative/reducer.cc b/paddle/fluid/imperative/reducer.cc index 5422b7ce9c855..a92704ce447dc 100644 --- a/paddle/fluid/imperative/reducer.cc +++ b/paddle/fluid/imperative/reducer.cc @@ -310,9 +310,8 @@ Reducer::Reducer(const std::vector> &vars, for (size_t global_var_index = 0; global_var_index < vars_.size(); ++global_var_index) { auto var = vars_[global_var_index]; - var->GradVarBase()->AddMutableHook( - std::make_shared([=]( - VariableWrapper *grad) { this->AddDistHook(global_var_index); })); + var->GradVarBase()->AddVoidHook(std::make_shared>( + [=]() { this->AddDistHook(global_var_index); })); var_index_map_[var->GradVarBase()->SharedVar().get()] = global_var_index; } diff --git a/paddle/fluid/imperative/tests/test_hooks.cc b/paddle/fluid/imperative/tests/test_hooks.cc index 8c907b9890652..5c4e1538cf053 100644 --- a/paddle/fluid/imperative/tests/test_hooks.cc +++ b/paddle/fluid/imperative/tests/test_hooks.cc @@ -37,6 +37,30 @@ namespace imperative { using vb_vector = std::vector>; using var_pair = std::pair; +std::shared_ptr DoubleHook( + const std::shared_ptr& var) { + // 1. create out var + auto out_var = std::make_shared(var->Name()); + out_var->SetType(var->Type()); + out_var->SetDataType(var->DataType()); + out_var->SetForwardDataType(var->ForwardDataType()); + out_var->InnerSetOverridedStopGradient(var->InnerOverridedStopGradient()); + + // 2. get input and output var's tensor + auto* out_tensor = out_var->MutableVar()->GetMutable(); + auto& tensor = var->Var().Get(); + out_tensor->Resize(tensor.dims()); + + // 3. double calc + auto* data = tensor.data(); + auto* out_data = out_tensor->mutable_data(platform::CPUPlace()); + for (int64_t i = 0; i < out_tensor->numel(); ++i) { + out_data[i] = data[i] * 2.0; + } + + return out_var; +} + TEST(TestHooks, TestGradVarLeafBackwardHook) { // 1. 
prepare Tracer tracer; @@ -73,16 +97,14 @@ TEST(TestHooks, TestGradVarLeafBackwardHook) { framework::AttributeMap mul_attr_map; mul_attr_map["use_mkldnn"] = false; - // add GradAccumulatorPostHook - x->GradVarBase()->AddMutableHook( - std::make_shared( - [=](VariableWrapper* grad) { - auto* grad_tensor = - grad->MutableVar()->GetMutable(); - for (int i = 0; i < grad_tensor->numel(); ++i) { - grad_tensor->mutable_data(place)[i] *= 2.0; - } - })); + // add VariableWrapper hook + x->GradVarBase()->AddVariableWrapperHook( + std::make_shared(DoubleHook)); + + // add Void hook + int64_t hook_value = 0; + x->GradVarBase()->AddVoidHook( + std::make_shared>([&]() { hook_value = 10; })); // 2. forward tracer.TraceOp("mul", ins, outs, mul_attr_map, place, true); @@ -98,12 +120,15 @@ TEST(TestHooks, TestGradVarLeafBackwardHook) { engine.Init(tensors, grad_tensors); engine.Execute(); + // verify VariableWrapper hook result framework::LoDTensor x_grad; framework::TensorCopySync(x->GradVar().Get(), place, &x_grad); for (int i = 0; i < x_grad.numel(); ++i) { ASSERT_EQ(x_grad.data()[i], 8.0); } + // verify Void hook result + ASSERT_EQ(hook_value, 10); framework::LoDTensor y_grad; framework::TensorCopySync(y->GradVar().Get(), place, @@ -152,16 +177,14 @@ void GradVarLeafBackwardHookWithGradAccmulatedTest() { memory::Copy(place, mutable_z, place, src_data.data(), sizeof(float) * src_data.size()); - // add ReduceBackwardHook - x->GradVarBase()->AddMutableHook( - std::make_shared( - [=](VariableWrapper* grad) { - auto* grad_tensor = - grad->MutableVar()->GetMutable(); - for (int i = 0; i < grad_tensor->numel(); ++i) { - grad_tensor->mutable_data(place)[i] *= 2.0; - } - })); + // add VariableWrapper hook + x->GradVarBase()->AddVariableWrapperHook( + std::make_shared(DoubleHook)); + + // add Void hook + int64_t hook_value = 0; + x->GradVarBase()->AddVoidHook( + std::make_shared>([&]() { hook_value = 100; })); // 2. 
forward var_pair x_pair = var_pair("X", vb_vector(1, x)); @@ -199,12 +222,15 @@ void GradVarLeafBackwardHookWithGradAccmulatedTest() { engine.Init(tensors, grad_tensors); engine.Execute(); + // verify VariableWrapper hook result framework::LoDTensor x_grad; framework::TensorCopySync(x->GradVar().Get(), place, &x_grad); for (int i = 0; i < x_grad.numel(); ++i) { ASSERT_EQ(x_grad.data()[i], 16.0); } + // verify Void hook result + ASSERT_EQ(hook_value, 100); framework::LoDTensor y_grad; framework::TensorCopySync(y->GradVar().Get(), place, diff --git a/paddle/fluid/imperative/tracer.cc b/paddle/fluid/imperative/tracer.cc index 608cc407d5b77..742514c0910a2 100644 --- a/paddle/fluid/imperative/tracer.cc +++ b/paddle/fluid/imperative/tracer.cc @@ -19,6 +19,7 @@ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/imperative/amp_auto_cast.h" #include "paddle/fluid/imperative/op_base.h" +#include "paddle/fluid/platform/denormal.h" #include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/string/string_helper.h" @@ -38,7 +39,7 @@ void SetCurrentTracer(const std::shared_ptr& tracer) { VLOG(6) << "Set current tracer: " << g_current_tracer; } -static void PassStopGradient(const NameVarBaseMap& outs, bool generate_grad) { +void PassStopGradient(const NameVarBaseMap& outs, bool generate_grad) { for (const auto& pair : outs) { for (const auto& var : pair.second) { // NOTE(zhiqiu): this happends when None output are passed from python @@ -134,6 +135,7 @@ void Tracer::TraceOp(const std::string& type, const NameVarBaseMap& ins, const platform::Place& place, bool trace_backward, const std::map& inplace_map) { platform::RecordEvent op_type_record_event(type); + platform::ScopedFlushDenormal flush; VLOG(1) << "Trace Op: " << type; if (FLAGS_use_mkldnn) { // if both lists are empty all ops are enabled (default for diff --git a/paddle/fluid/imperative/tracer.h b/paddle/fluid/imperative/tracer.h index b10d1b2d0b49d..8f50550878262 100644 --- a/paddle/fluid/imperative/tracer.h +++ b/paddle/fluid/imperative/tracer.h @@ -130,5 +130,7 @@ void IncreaseVarbaseReferenceCountUntilCopyComplete( const std::shared_ptr& var, const platform::Place& place); +void PassStopGradient(const NameVarBaseMap& outs, bool generate_grad); + } // namespace imperative } // namespace paddle diff --git a/paddle/fluid/imperative/variable_wrapper.h b/paddle/fluid/imperative/variable_wrapper.h index 7d287c9829104..5fa8b89a396d9 100644 --- a/paddle/fluid/imperative/variable_wrapper.h +++ b/paddle/fluid/imperative/variable_wrapper.h @@ -38,6 +38,9 @@ class VariableWrapper { explicit VariableWrapper(const std::string& name) : name_(name) {} + VariableWrapper(const std::string& name, const framework::Variable& variable) + : var_(variable), name_(name) {} + ~VariableWrapper() { VLOG(10) << "Destruct VariableWrapper: " << Name(); } const framework::Variable& Var() const { return var_; } @@ -220,35 +223,35 @@ class VariableWrapper { } /* Hook related methods */ - bool HasHook() const { return !hooks_.empty(); } - - bool HasMutableHook() const { return !mutable_hooks_.empty(); } + bool HasVariableWrapperHook() const { return !var_hooks_.empty(); } - int64_t AddHook(std::shared_ptr&& hook) { - hooks_.emplace(next_hook_id_, std::move(hook)); + int64_t AddVariableWrapperHook(std::shared_ptr&& hook) { + var_hooks_.emplace(next_hook_id_, std::move(hook)); return next_hook_id_++; } - bool RemoveHook(const int64_t& hook_id) { - auto remove_cnt = hooks_.erase(hook_id); + bool RemoveVariableWrapperHook(const int64_t& hook_id) { + auto 
remove_cnt = var_hooks_.erase(hook_id); if (remove_cnt == 0) { return false; } return true; } - const std::map>& GetHooks() - const { - return hooks_; + const std::map>& + GetVariableWrapperHooks() const { + return var_hooks_; } - void AddMutableHook(std::shared_ptr&& hook) { - mutable_hooks_.emplace_back(std::move(hook)); + bool HasVoidHook() const { return !void_hooks_.empty(); } + + void AddVoidHook(std::shared_ptr>&& hook) { + void_hooks_.emplace_back(std::move(hook)); } - const std::vector>& - GetMutableHooks() const { - return mutable_hooks_; + const std::vector>>& GetVoidHooks() + const { + return void_hooks_; } private: @@ -319,14 +322,19 @@ class VariableWrapper { // isn't need bool is_empty_{false}; - // NOTE(chenweihang): only grad var can hold hooks now + // NOTE(chenweihang): only grad var will hold hooks now int64_t next_hook_id_{0}; - // Hooks used to register hook for grad var, support adding and removing, + // [ Hooks with VariableWrapper as input and output ] + // NOTE: Now registered for grad var, support adding and removing, // key is the accumulated int64_t value - std::map> hooks_; - // Hooks executed after the execution of the entire backward process is over, - // currently only supported for reducing in distributed training - std::vector> mutable_hooks_; + // NOTE: Var hook need to support removing, so need hook id + std::map> var_hooks_; + // [ Hooks without input and output ] + // NOTE: Now registered after the execution of the entire backward + // process is over, currently only used for reducing in distributed + // training + // NOTE: Now no need to support remove void hook + std::vector>> void_hooks_; }; } // namespace imperative diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt index 93fd85f13cbf0..c002c7a10cb7b 100644 --- a/paddle/fluid/inference/CMakeLists.txt +++ b/paddle/fluid/inference/CMakeLists.txt @@ -33,7 +33,7 @@ if (WITH_LITE) add_subdirectory(lite) endif() -# fluid_modules exclude API-interface of inference/api and inference/capi +# fluid_modules exclude API-interface of inference/api and inference/capi_exp get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES) # Adapt to custom op mechanism: Include the header files related to the data type @@ -61,7 +61,7 @@ if(NOT APPLE) endif() # C inference API -add_subdirectory(capi) +add_subdirectory(capi_exp) if(WITH_TESTING AND WITH_INFERENCE_API_TEST) add_subdirectory(tests/api) diff --git a/paddle/fluid/inference/analysis/argument.h b/paddle/fluid/inference/analysis/argument.h index bd27b1f5f3447..255c6ca75dfd7 100644 --- a/paddle/fluid/inference/analysis/argument.h +++ b/paddle/fluid/inference/analysis/argument.h @@ -213,6 +213,11 @@ struct Argument { DECL_ARGUMENT_FIELD(tensorrt_use_calib_mode, TensorRtUseCalibMode, bool); DECL_ARGUMENT_FIELD(tensorrt_use_oss, TensorRtUseOSS, bool); + DECL_ARGUMENT_FIELD(use_dlnne, UseDlnne, bool); + DECL_ARGUMENT_FIELD(dlnne_min_subgraph_size, DlnneMinSubgraphSize, int); + DECL_ARGUMENT_FIELD(dlnne_max_batch_size, DlnneMaxBatchSize, int); + DECL_ARGUMENT_FIELD(dlnne_workspace_size, DlnneWorkspaceSize, int); + DECL_ARGUMENT_FIELD(lite_passes_filter, LitePassesFilter, std::vector); DECL_ARGUMENT_FIELD(lite_ops_filter, LiteOpsFilter, std::vector); @@ -222,6 +227,11 @@ struct Argument { DECL_ARGUMENT_FIELD(use_xpu, UseXpu, bool); DECL_ARGUMENT_FIELD(xpu_l3_workspace_size, XpuL3WorkspaceSize, int); + DECL_ARGUMENT_FIELD(xpu_locked, XpuLocked, bool); + DECL_ARGUMENT_FIELD(xpu_autotune, XpuAutotune, bool); + 
DECL_ARGUMENT_FIELD(xpu_autotune_file, XpuAutotuneFile, std::string); + DECL_ARGUMENT_FIELD(xpu_precision, XpuPrecision, std::string); + DECL_ARGUMENT_FIELD(xpu_adaptive_seqlen, XpuAdaptiveSeqlen, bool); // Memory optimized related. DECL_ARGUMENT_FIELD(enable_memory_optim, EnableMemoryOptim, bool); diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.cc b/paddle/fluid/inference/analysis/ir_pass_manager.cc index a4e263e2f464c..8407f98e6dfd9 100644 --- a/paddle/fluid/inference/analysis/ir_pass_manager.cc +++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc @@ -166,6 +166,11 @@ void IRPassManager::CreatePasses(Argument *argument, // run fp16. pass->Set("disable_trt_plugin_fp16", new bool(argument->disable_trt_plugin_fp16())); + } else if (pass_name == "dlnne_subgraph_pass") { + pass->Set("min_subgraph_size", + new int(argument->dlnne_min_subgraph_size())); + pass->Set("program", + new framework::ProgramDesc *(&argument->main_program())); } if (pass_name == "lite_subgraph_pass") { bool enable_int8 = @@ -183,6 +188,12 @@ void IRPassManager::CreatePasses(Argument *argument, new int(argument->xpu_l3_workspace_size())); pass->Set("cpu_math_library_num_threads", new int(argument->cpu_math_library_num_threads())); + pass->Set("locked", new bool(argument->xpu_locked())); + pass->Set("autotune", new bool(argument->xpu_autotune())); + pass->Set("autotune_file", + new std::string(argument->xpu_autotune_file())); + pass->Set("precision", new std::string(argument->xpu_precision())); + pass->Set("adaptive_seqlen", new bool(argument->xpu_adaptive_seqlen())); } disable_logs_ = argument->disable_logs(); if (pass_name == "fc_fuse_pass") { diff --git a/paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt b/paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt index e35178428cc7b..330f7a9984734 100644 --- a/paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt +++ b/paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt @@ -20,3 +20,15 @@ if (WITH_LITE) set(INFER_IR_PASSES ${INFER_IR_PASSES} lite_subgraph_pass CACHE INTERNAL "") cc_test(lite_subgraph_pass_tester SRCS lite_subgraph_pass_tester.cc DEPS lite_subgraph_pass gtest glog) endif() + +MESSAGE("WITH_DLNNE:${WITH_DLNNE}") +if(WITH_DLNNE) + cc_library(dlnne_subgraph_pass SRCS dlnne_subgraph_pass.cc DEPS ${analysis_deps} subgraph_util) + set(analysis_deps ${analysis_deps} + subgraph_util dlnne_subgraph_pass + CACHE INTERNAL "") + + set(pass_file ${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/paddle_inference_pass.h.tmp) + file(APPEND ${pass_file} "USE_PASS(dlnne_subgraph_pass);\n") + set(INFER_IR_PASSES ${INFER_IR_PASSES} dlnne_subgraph_pass CACHE INTERNAL "") +endif() diff --git a/paddle/fluid/operators/distributed/large_scale_kv.cc b/paddle/fluid/inference/analysis/ir_passes/dlnne_reg_py.h similarity index 64% rename from paddle/fluid/operators/distributed/large_scale_kv.cc rename to paddle/fluid/inference/analysis/ir_passes/dlnne_reg_py.h index d2673ed6ffb36..ae977c1403a87 100644 --- a/paddle/fluid/operators/distributed/large_scale_kv.cc +++ b/paddle/fluid/inference/analysis/ir_passes/dlnne_reg_py.h @@ -1,4 +1,4 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -11,16 +11,11 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
// See the License for the specific language governing permissions and // limitations under the License. - -#include "paddle/fluid/operators/distributed/large_scale_kv.h" +#pragma once namespace paddle { -namespace operators { -namespace distributed { - -std::once_flag LargeScaleKV::init_flag_; -std::shared_ptr LargeScaleKV::scale_kv_(nullptr); +namespace inference { -} // namespace distributed -} // namespace operators +int RegisterPyFunc(const std::string& name, void* pfn); +} // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/analysis/ir_passes/dlnne_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/dlnne_subgraph_pass.cc new file mode 100644 index 0000000000000..8f789139af9bf --- /dev/null +++ b/paddle/fluid/inference/analysis/ir_passes/dlnne_subgraph_pass.cc @@ -0,0 +1,351 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#include +#include +#include + +#include +#include + +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" +#include "paddle/fluid/framework/ir/subgraph_detector.h" +#include "paddle/fluid/framework/op_version_registry.h" +#include "paddle/fluid/inference/analysis/helper.h" +#include "paddle/fluid/inference/analysis/ir_passes/dlnne_reg_py.h" +#include "paddle/fluid/inference/analysis/ir_passes/dlnne_subgraph_pass.h" +#include "paddle/fluid/string/pretty_log.h" + +namespace paddle { +namespace inference { + +int (*PyConvertGraph)(const char *graph_name); + +int RegisterPyFunc(const std::string &name, void *pfn) { + if (name.compare("convert_graph") == 0) { + PyConvertGraph = reinterpret_cast(pfn); + } + + return 0; +} +int ConvertGraph(std::string graph_name) { + LOG(INFO) << "starting doing convert_graph"; + + PyConvertGraph(graph_name.c_str()); + + return 0; +} + +namespace analysis { + +using framework::ir::Node; + +void analysis::DlnneSubgraphPass::ApplyImpl(framework::ir::Graph *graph) const { + static std::unordered_set teller_set{ + "mul", "matmul", "conv2d", "pool2d", "relu", "softmax", "sigmoid", + "hard_swish", "depthwise_conv2d", "batch_norm", "concat", "tanh", "pad", + "elementwise_add", "elementwise_mul", "dropout", "prelu", + "conv2d_transpose", "leaky_relu", + // "fc", + "shuffle_channel", "swish", "split", + // "instance_norm", + "gelu", + // "layer_norm", + // "scale", + // "stack", + "relu6", "reshape2", "transpose2", "concat", "slice", + }; + + framework::ir::FusePassBase::Init("dlnne_subgraph_pass", graph); + + auto teller = [&](const framework::ir::Node *node) { + if (!node->IsOp() || !node->Op()) return false; + return teller_set.find(node->Op()->Type()) != teller_set.end(); + }; + + framework::ir::SubGraphFuser fuser( + graph, teller, Get("min_subgraph_size") /*min subgraph size*/, + "dlnne_engine"); + fuser(); + + std::vector graph_param_names = + ExtractParameters(graph->Nodes()); + // those parameter already exist in dlnne, and should not have another copy in + // fluid. 
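RegisterPyFunc above lets a Python-side converter be injected as a plain function pointer through void*, and ConvertGraph later calls it by name. A self-contained sketch of the same registration pattern, with illustrative names:

#include <iostream>
#include <string>

// The callback is handed in as void*, cast back to its real signature, and
// stored in a file-scope pointer that the pass invokes after dumping a subgraph.
using ConvertFn = int (*)(const char*);
static ConvertFn g_convert_graph = nullptr;

int RegisterCallback(const std::string& name, void* pfn) {
  if (name == "convert_graph") {
    g_convert_graph = reinterpret_cast<ConvertFn>(pfn);
  }
  return 0;
}

int DemoConvert(const char* graph_name) {
  std::cout << "converting " << graph_name << "\n";
  return 0;
}

int main() {
  RegisterCallback("convert_graph", reinterpret_cast<void*>(&DemoConvert));
  if (g_convert_graph != nullptr) {
    g_convert_graph("engine_key_123");  // invoked by name, like ConvertGraph(engine_key)
  }
  return 0;
}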
+ std::vector repetitive_params; + + for (auto *node : graph->Nodes()) { + if (node->IsOp() && !framework::ir::Agent(node).subgraph()->empty()) { + CreateDlnneOp(node, graph, graph_param_names, &repetitive_params); + + std::unordered_set nodes2remove( + framework::ir::Agent(node).subgraph()->begin(), + framework::ir::Agent(node).subgraph()->end()); + framework::ir::GraphSafeRemoveNodes(graph, nodes2remove); + } + } + + std::unordered_set nodes2remove; + for (auto *node : graph->Nodes()) { + if (node->IsOp() && framework::ir::Agent(node).deleted()) { + nodes2remove.insert(node); + } + } + framework::ir::GraphSafeRemoveNodes(graph, nodes2remove); +} + +std::string GenerateEngineKey(const std::set &engine_inputs, + const std::set &engine_outputs, + const std::string &predictor_id) { + std::string engine_hash_key = ""; + for (auto name : engine_inputs) { + engine_hash_key += name; + } + for (auto name : engine_outputs) { + engine_hash_key += name; + } + engine_hash_key += predictor_id; + auto engine_key = std::to_string(std::hash()(engine_hash_key)); + return engine_key; +} +std::string replace_name(std::string name, const char *raw, + const char *new_char) { + std::string r_name = name; + int pos = r_name.find(raw); + while (pos >= 0) { + r_name = r_name.replace(pos, 1, new_char); + pos = r_name.find(raw); + } + return r_name; +} + +void DlnneSubgraphPass::CreateDlnneOp( + framework::ir::Node *node, framework::ir::Graph *graph, + const std::vector &graph_params, + std::vector *repetitive_params) const { + auto *op_desc = node->Op(); + auto &subgraph = *framework::ir::Agent(node).subgraph(); + PADDLE_ENFORCE_EQ(subgraph.empty(), false, + platform::errors::PreconditionNotMet( + "The subgraph should not be empty.")); + + // A fake block desc. + framework::proto::BlockDesc block_proto; + framework::BlockDesc block_desc(nullptr, &block_proto); + block_desc.Proto()->set_parent_idx(-1); + block_desc.Proto()->set_idx(0); + LOG(INFO) << "--- detect a sub-graph with " << subgraph.size() << " nodes"; + // for debug + framework::ProgramDesc tmp_dump_program_desc; + auto *tmp_dump_main_block = tmp_dump_program_desc.MutableBlock(0); + + std::unordered_map name_var_desc; + std::set name_var_input_nodes; + std::set name_var_output_nodes; + std::set name_ops; + + for (auto *node : subgraph) { + auto *op = block_desc.AppendOp(); + *op->Proto() = *node->Op()->Proto(); + + // debug + { + name_ops.insert(node->Name()); + auto *tmp_dump_new_block_op = tmp_dump_main_block->AppendOp(); + + framework::OpDesc op_desc; + op_desc.CopyFrom(*node->Op()); + + for (auto argument_name : op_desc.InputArgumentNames()) { + if (std::count(graph_params.begin(), graph_params.end(), + argument_name) > 0) { + op_desc.Rename(argument_name, replace_name(argument_name, "/", ".")); + } + } + for (auto argument_name : op_desc.OutputArgumentNames()) { + if (std::count(graph_params.begin(), graph_params.end(), + argument_name) > 0) { + op_desc.Rename(argument_name, replace_name(argument_name, "/", ".")); + } + } + *tmp_dump_new_block_op->Proto() = *op_desc.Proto(); + + for (auto *x : node->inputs) { + if (x->IsVar()) { + name_var_desc[x->Name()] = x->Var(); + } + if (std::count(graph_params.begin(), graph_params.end(), x->Name()) == + 0) + name_var_input_nodes.insert(x->Name()); + } + + for (auto *x : node->outputs) { + if (x->IsVar()) { + name_var_desc[x->Name()] = x->Var(); + } + if (std::count(graph_params.begin(), graph_params.end(), x->Name()) == + 0) + name_var_output_nodes.insert(x->Name()); + } + } + } + std::set valid_input_names; 
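replace_name above rewrites '/' to '.' in parameter names such as "conv1/weights" so they are usable as file names when the subgraph is dumped. An equivalent standalone helper, sketched with std::string::npos handling:

#include <cassert>
#include <string>

// Replaces every occurrence of one character with another, as replace_name
// does for '/' -> '.'.
std::string ReplaceAll(std::string name, char raw, char replacement) {
  for (std::size_t pos = name.find(raw); pos != std::string::npos;
       pos = name.find(raw, pos + 1)) {
    name[pos] = replacement;
  }
  return name;
}

int main() {
  assert(ReplaceAll("conv1/weights", '/', '.') == "conv1.weights");
  return 0;
}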
+ std::set valid_output_names; + for (auto name : name_var_output_nodes) { + if (name_var_input_nodes.find(name) == name_var_input_nodes.end()) { + valid_output_names.insert(name); + } + } + + for (auto name : name_var_input_nodes) { + if (name_var_output_nodes.find(name) == name_var_output_nodes.end()) { + valid_input_names.insert(name); + } + } + + // Then, we will use the input_names_with_id and output_names_with_id to + // generate the engine key. + // So, We use set instead of unordered_set here to ensure that the engine key + // is unique. + std::set input_names; + std::set input_names_with_id; + std::vector params; + // if we delete fluid copy of params shared by more than 1 ops, there will be + // problem, so we filter them out. + + // The node->inputs contains input tensors and parameters. + for (auto *x : node->inputs) { + input_names.insert(x->Name()); + input_names_with_id.insert(x->Name() + std::to_string(x->id())); + if (std::count(graph_params.begin(), graph_params.end(), x->Name()) > 0) { + params.push_back(x->Name()); + } + } + + std::set output_names; + std::set output_names_with_id; + std::vector origin_output_dims; + for (auto *x : node->outputs) { + origin_output_dims.push_back(x->Var()->GetShape().size()); + output_names.insert(x->Name()); + output_names_with_id.insert(x->Name() + std::to_string(x->id())); + } + + std::unordered_map output_name_map; + std::unordered_map graph_var_map; + + for (framework::ir::Node *node : graph->Nodes()) { + if (node->IsVar() && node->Var()) { + graph_var_map[node->Name()] = node; + } + } + + // Set attrs + op_desc->SetType("dlnne_engine"); + op_desc->SetInput("Xs", std::vector(valid_input_names.begin(), + valid_input_names.end())); + + op_desc->SetOutput("Ys", std::vector(valid_output_names.begin(), + valid_output_names.end())); + + op_desc->SetAttr("parameters", params); + auto engine_key = GenerateEngineKey(input_names_with_id, output_names_with_id, + std::to_string(0)); + op_desc->SetAttr("engine_key", engine_key); + auto *scope = param_scope(); + + { + std::set input_names; + + for (auto name : name_var_input_nodes) { + if (name_var_output_nodes.find(name) == name_var_output_nodes.end()) { + input_names.insert(name); + } + } + + // add feed to subgraph: + int input_idx = 0; + for (auto input_name : input_names) { + auto *feed0 = tmp_dump_main_block->AppendOp(); + feed0->SetType("feed"); + feed0->SetInput("X", {"feed"}); + feed0->SetOutput("Out", {input_name}); + feed0->SetAttr("col", input_idx); + input_idx++; + } + // add fetch to subgraph: + int output_idx = 0; + for (auto output_name : valid_output_names) { + auto *fetch0 = tmp_dump_main_block->AppendOp(); + fetch0->SetType("fetch"); + fetch0->SetInput("X", {output_name}); + fetch0->SetOutput("Out", {"out"}); + fetch0->SetAttr("col", output_idx); + output_idx++; + } + + mkdir("./dump", 0777); + std::string dir_name = "./dump/" + engine_key; + mkdir(dir_name.c_str(), 0777); + ofstream m_stream; + m_stream.open(dir_name + "/__model__", ios::out); + + VLOG(4) << "name_var_desc size:" << name_var_desc.size(); + + for (auto &kv : name_var_desc) { + auto *new_add_var = tmp_dump_main_block->Proto()->add_vars(); + *new_add_var = *kv.second->Proto(); + auto *variable_tmp = scope->FindVar(kv.first); + if (variable_tmp != nullptr) { + *new_add_var->mutable_name() = replace_name(kv.first, "/", "."); + new_add_var->set_persistable(true); + } else { + new_add_var->set_persistable(false); + } + } + + for (auto param_name : params) { + auto *var = scope->FindVar(param_name); + if (var != 
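The loops above classify the subgraph boundary variables: an engine input is consumed inside the subgraph but never produced by it, and an engine output is produced inside but never consumed inside. A small worked example with made-up variable names:

#include <iostream>
#include <set>
#include <string>

int main() {
  std::set<std::string> consumed{"x", "conv1.out"};          // like name_var_input_nodes
  std::set<std::string> produced{"conv1.out", "relu1.out"};  // like name_var_output_nodes

  for (const auto& name : consumed) {
    if (produced.count(name) == 0) std::cout << "engine input:  " << name << "\n";
  }
  for (const auto& name : produced) {
    if (consumed.count(name) == 0) std::cout << "engine output: " << name << "\n";
  }
  // Prints x as the input and relu1.out as the output; conv1.out stays internal.
  return 0;
}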
nullptr) { + auto *var_t = var->GetMutable(); + ofstream p_stream; + p_stream.open(dir_name + "/" + replace_name(param_name, "/", "."), + ios::out); + platform::DeviceContextPool &pool = + platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(var_t->place()); + framework::SerializeToStream(p_stream, *var_t, dev_ctx); + p_stream.close(); + } + } + + std::string model; + + tmp_dump_program_desc.Proto()->SerializeToString(&model); + m_stream << model; + m_stream.close(); + + op_desc->SetBlockAttr("sub_block", tmp_dump_main_block); + op_desc->SetAttr("subgraph", model); + op_desc->Flush(); + + ConvertGraph(engine_key); + } +} + +} // namespace analysis +} // namespace inference +} // namespace paddle + +REGISTER_PASS(dlnne_subgraph_pass, + paddle::inference::analysis::DlnneSubgraphPass); diff --git a/paddle/fluid/inference/analysis/ir_passes/dlnne_subgraph_pass.h b/paddle/fluid/inference/analysis/ir_passes/dlnne_subgraph_pass.h new file mode 100644 index 0000000000000..5a1d2506fdb09 --- /dev/null +++ b/paddle/fluid/inference/analysis/ir_passes/dlnne_subgraph_pass.h @@ -0,0 +1,55 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once +#include +#include +#include +#include +#include + +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/ir/pass.h" +#include "paddle/fluid/inference/analysis/ir_passes/subgraph_util.h" +#include "paddle/fluid/inference/api/paddle_analysis_config.h" + +namespace paddle { +namespace framework { +namespace ir { +class Graph; +class Node; +} // namespace ir +} // namespace framework +} // namespace paddle + +namespace paddle { +namespace inference { + +int ConvertGraph(std::string graph_name); + +namespace analysis { + +class DlnneSubgraphPass : public framework::ir::FusePassBase { + public: + void ApplyImpl(framework::ir::Graph *graph) const override; + + private: + void CleanIntermediateOutputs(framework::ir::Node *node); + void CreateDlnneOp(framework::ir::Node *x, framework::ir::Graph *graph, + const std::vector &graph_params, + std::vector *repetitive_params) const; +}; + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.cc index c697914904b3e..b8cac8992f4ee 100644 --- a/paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.cc +++ b/paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.cc @@ -245,6 +245,11 @@ void LiteSubgraphPass::SetUpEngine( bool use_xpu = Get("use_xpu"); int xpu_l3_workspace_size = Get("xpu_l3_workspace_size"); int cpu_math_library_num_threads = Get("cpu_math_library_num_threads"); + bool locked = Get("locked"); + bool autotune = Get("autotune"); + std::string autotune_file = Get("autotune_file"); + std::string precision = Get("precision"); + bool adaptive_seqlen = Get("adaptive_seqlen"); lite_api::TargetType target_type; if (use_gpu) 
{ @@ -282,6 +287,11 @@ void LiteSubgraphPass::SetUpEngine( }; config.cpu_math_library_num_threads = cpu_math_library_num_threads; config.xpu_l3_workspace_size = xpu_l3_workspace_size; + config.locked = locked; + config.autotune = autotune; + config.autotune_file = autotune_file; + config.precision = precision; + config.adaptive_seqlen = adaptive_seqlen; if (dump_model) { lite::StrToBinaryFile("./model.bin", config.model); lite::StrToBinaryFile("./param.bin", config.param); diff --git a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc index 60de4234b41a8..f57f07883dcd7 100644 --- a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc +++ b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc @@ -20,6 +20,7 @@ #include "paddle/fluid/inference/analysis/helper.h" #include "paddle/fluid/inference/tensorrt/convert/op_converter.h" #include "paddle/fluid/inference/tensorrt/engine.h" +#include "paddle/fluid/inference/tensorrt/helper.h" #include "paddle/fluid/inference/tensorrt/op_teller.h" namespace paddle { @@ -321,11 +322,20 @@ void TensorRtSubgraphPass::CreateTensorRTOp( opt_input_shape = {}; } - if (min_input_shape.size() > 0 && TRT_VERSION > 6000) { + auto to_major_version = [&](int full_version) -> float { + return (full_version / 100) / 10.0; + }; + const float compile_time_trt_version = to_major_version(TRT_VERSION); + const float run_time_trt_version = + to_major_version(tensorrt::GetInferLibVersion()); + if (compile_time_trt_version != run_time_trt_version) { LOG_FIRST_N(WARNING, 1) - << "The Paddle lib links the " << TRT_VERSION << " version TensorRT, " - << "make sure the runtime TensorRT you are using is no less than this " - "version, otherwise, there might be Segfault!"; + << "The Paddle Inference library is compiled with " + << compile_time_trt_version << " version TensorRT, " + << "but the runtime TensorRT you are using is " << run_time_trt_version + << " version. " + "This might cause serious compatibility issues. We strongly " + "recommend using the same TRT version at runtime."; } // Setting the disable_trt_plugin_fp16 to true means that TRT plugin will not diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc index 0622fb27d9e38..853c1ac1da874 100644 --- a/paddle/fluid/inference/api/analysis_config.cc +++ b/paddle/fluid/inference/api/analysis_config.cc @@ -26,6 +26,7 @@ namespace paddle { struct MkldnnQuantizerConfig; extern const std::vector kTRTSubgraphPasses; +extern const std::vector kDlnneSubgraphPasses; extern const std::vector kLiteSubgraphPasses; PassStrategy *AnalysisConfig::pass_builder() const { @@ -95,9 +96,17 @@ void AnalysisConfig::DisableFCPadding() { Update(); } -void AnalysisConfig::EnableXpu(int l3_workspace_size) { +void AnalysisConfig::EnableXpu(int l3_workspace_size, bool locked, + bool autotune, const std::string &autotune_file, + const std::string &precision, + bool adaptive_seqlen) { use_xpu_ = true; xpu_l3_workspace_size_ = l3_workspace_size; + xpu_locked_ = locked; + xpu_autotune_ = autotune; + xpu_autotune_file_ = autotune_file; + xpu_precision_ = precision; + xpu_adaptive_seqlen_ = adaptive_seqlen; Update(); } @@ -134,6 +143,9 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) { CP_MEMBER(trt_use_static_engine_); CP_MEMBER(trt_use_calib_mode_); CP_MEMBER(trt_use_oss_); + // Dlnne related + CP_MEMBER(use_dlnne_); + CP_MEMBER(dlnne_min_subgraph_size_); // MKLDNN related. 
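The TensorRT check above reduces the encoded version number to major.minor before comparing the compile-time and runtime libraries; assuming Paddle encodes the version roughly as major*1000 + minor*100 + patch*10, 7130 maps to 7.1. A sketch of that conversion with illustrative version numbers:

#include <iostream>

float ToMajorVersion(int full_version) { return (full_version / 100) / 10.0f; }

int main() {
  const int compile_time = 7130;  // e.g. linked against TensorRT 7.1.x
  const int run_time = 6050;      // e.g. runtime libnvinfer reports 6.0.x
  if (ToMajorVersion(compile_time) != ToMajorVersion(run_time)) {
    std::cout << "compile-time TRT " << ToMajorVersion(compile_time)
              << " vs runtime TRT " << ToMajorVersion(run_time)
              << ": expect compatibility issues\n";
  }
  return 0;
}

Comparing the two floats directly is fine here because both sides are produced by the same integer arithmetic, so equal major.minor values compare exactly equal.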
CP_MEMBER(use_mkldnn_); CP_MEMBER(mkldnn_enabled_op_types_); @@ -157,6 +169,11 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) { CP_MEMBER(use_xpu_); CP_MEMBER(xpu_l3_workspace_size_); + CP_MEMBER(xpu_locked_); + CP_MEMBER(xpu_autotune_); + CP_MEMBER(xpu_autotune_file_); + CP_MEMBER(xpu_precision_); + CP_MEMBER(xpu_adaptive_seqlen_); // profile related. CP_MEMBER(with_profile_); @@ -211,6 +228,21 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) { pass_builder_->DeletePass(ps); } } + if (use_dlnne_) { + auto all_passes = kDlnneSubgraphPasses; + auto other_passes = other.pass_builder()->AllPasses(); + // We should sort them, because the user may call the SwitchIrDebug + // interface, which will change the pass. + std::sort(all_passes.begin(), all_passes.end()); + std::sort(other_passes.begin(), other_passes.end()); + std::vector deleted_passes; + std::set_difference(all_passes.begin(), all_passes.end(), + other_passes.begin(), other_passes.end(), + std::inserter(deleted_passes, deleted_passes.begin())); + for (auto ps : deleted_passes) { + pass_builder_->DeletePass(ps); + } + } } void AnalysisConfig::EnableCUDNN() { @@ -309,6 +341,12 @@ void AnalysisConfig::EnableTensorRtEngine( #endif } +void AnalysisConfig::EnableDlnne(int min_subgraph_size) { + use_dlnne_ = true; + dlnne_min_subgraph_size_ = min_subgraph_size; + Update(); +} + void AnalysisConfig::SetTRTDynamicShapeInfo( std::map> min_input_shape, std::map> max_input_shape, @@ -383,6 +421,14 @@ void AnalysisConfig::Update() { pass_builder()->AppendPass(pass); } } + LOG(INFO) << "use_dlnne_:" << use_dlnne_ << std::endl; + if (use_dlnne_) { + pass_builder()->ClearPasses(); + for (const auto &pass : kDlnneSubgraphPasses) { + pass_builder()->AppendPass(pass); + } + } + if (use_gpu() && use_cudnn_) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (!enable_ir_optim_) { @@ -479,6 +525,9 @@ std::string AnalysisConfig::SerializeInfoCache() { ss << tensorrt_max_batchsize_; ss << tensorrt_min_subgraph_size_; + ss << use_dlnne_; + ss << dlnne_min_subgraph_size_; + for (auto &op : trt_disabled_ops_) ss << op.c_str(); ss << ";"; @@ -512,6 +561,11 @@ std::string AnalysisConfig::SerializeInfoCache() { ss << use_lite_; ss << use_xpu_; ss << xpu_l3_workspace_size_; + ss << xpu_locked_; + ss << xpu_autotune_; + ss << xpu_autotune_file_; + ss << xpu_precision_; + ss << xpu_adaptive_seqlen_; ss << thread_local_stream_; diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 4b6c746d57525..95b0831836843 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -537,6 +537,12 @@ void AnalysisPredictor::PrepareArgument() { argument_.SetCloseTrtPluginFp16(config_.disable_trt_plugin_fp16_); } + if (config_.dlnne_enabled()) { + LOG(INFO) << "Dlnne subgraph is enabled"; + argument_.SetUseDlnne(true); + argument_.SetDlnneMinSubgraphSize(config_.dlnne_min_subgraph_size_); + } + if (config_.lite_engine_enabled()) { argument_.SetCpuMathLibraryNumThreads( config_.cpu_math_library_num_threads()); @@ -546,6 +552,11 @@ void AnalysisPredictor::PrepareArgument() { argument_.SetLiteZeroCopy(config_.lite_zero_copy_); argument_.SetUseXpu(config_.use_xpu_); argument_.SetXpuL3WorkspaceSize(config_.xpu_l3_workspace_size_); + argument_.SetXpuLocked(config_.xpu_locked_); + argument_.SetXpuAutotune(config_.xpu_autotune_); + argument_.SetXpuAutotuneFile(config_.xpu_autotune_file_); + 
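The copy constructor above finds the DLNNE passes the user has already removed by sorting both lists and taking their set difference. A standalone sketch of that diffing step with a few pass names:

#include <algorithm>
#include <iostream>
#include <iterator>
#include <string>
#include <vector>

int main() {
  std::vector<std::string> all_passes{"is_test_pass", "conv_bn_fuse_pass",
                                      "dlnne_subgraph_pass"};
  std::vector<std::string> other_passes{"is_test_pass", "conv_bn_fuse_pass"};

  // set_difference requires sorted ranges.
  std::sort(all_passes.begin(), all_passes.end());
  std::sort(other_passes.begin(), other_passes.end());

  std::vector<std::string> deleted;
  std::set_difference(all_passes.begin(), all_passes.end(),
                      other_passes.begin(), other_passes.end(),
                      std::back_inserter(deleted));
  for (const auto& pass : deleted) {
    std::cout << "delete pass: " << pass << "\n";  // dlnne_subgraph_pass
  }
  return 0;
}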
argument_.SetXpuPrecision(config_.xpu_precision_); + argument_.SetXpuAdaptiveSeqlen(config_.xpu_adaptive_seqlen_); LOG(INFO) << "Lite subgraph engine is enabled"; } diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h index e492b32cb6cbe..2bbd4bb837a22 100644 --- a/paddle/fluid/inference/api/paddle_analysis_config.h +++ b/paddle/fluid/inference/api/paddle_analysis_config.h @@ -177,7 +177,10 @@ struct PD_INFER_DECL AnalysisConfig { /// void DisableGpu(); - void EnableXpu(int l3_workspace_size = 0xfffc00); + void EnableXpu(int l3_workspace_size = 0xfffc00, bool locked = false, + bool autotune = true, const std::string& autotune_file = "", + const std::string& precision = "int16", + bool adaptive_seqlen = false); /// /// \brief A boolean state telling whether the GPU is turned on. /// @@ -360,6 +363,9 @@ struct PD_INFER_DECL AnalysisConfig { /// bool tensorrt_dla_enabled() { return trt_use_dla_; } + void EnableDlnne(int min_subgraph_size = 3); + bool dlnne_enabled() const { return use_dlnne_; } + /// /// \brief Turn on the usage of Lite sub-graph engine. /// @@ -627,6 +633,10 @@ struct PD_INFER_DECL AnalysisConfig { std::vector trt_disabled_ops_{}; bool disable_trt_plugin_fp16_{false}; + // dlnne related. + bool use_dlnne_{false}; + int dlnne_min_subgraph_size_{3}; + // memory reuse related. bool enable_memory_optim_{false}; @@ -661,6 +671,11 @@ struct PD_INFER_DECL AnalysisConfig { bool thread_local_stream_{false}; bool use_xpu_{false}; int xpu_l3_workspace_size_; + bool xpu_locked_; + bool xpu_autotune_; + std::string xpu_autotune_file_; + std::string xpu_precision_; + bool xpu_adaptive_seqlen_; // mkldnn related. int mkldnn_cache_capacity_{0}; diff --git a/paddle/fluid/inference/api/paddle_pass_builder.cc b/paddle/fluid/inference/api/paddle_pass_builder.cc index 1d77ddaf73ef7..2b7333edae0da 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.cc +++ b/paddle/fluid/inference/api/paddle_pass_builder.cc @@ -110,6 +110,15 @@ const std::vector kTRTSubgraphPasses({ "transpose_flatten_concat_fuse_pass", }); +const std::vector kDlnneSubgraphPasses({ + "is_test_pass", // + "simplify_with_basic_ops_pass", // + "conv_bn_fuse_pass", // + "depthwise_conv_bn_fuse_pass", // + "shuffle_channel_detect_pass", // + "dlnne_subgraph_pass", // +}); + const std::vector kLiteSubgraphPasses({ #ifdef PADDLE_WITH_LITE "lite_subgraph_pass", diff --git a/paddle/fluid/inference/api/paddle_pass_builder.h b/paddle/fluid/inference/api/paddle_pass_builder.h index a725ebab35ead..d7556b50031b7 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.h +++ b/paddle/fluid/inference/api/paddle_pass_builder.h @@ -242,6 +242,9 @@ class PD_INFER_DECL XpuPassStrategy final : public PassStrategy { /// \brief List of tensorRT subgraph passes. PD_INFER_DECL extern const std::vector kTRTSubgraphPasses; +/// \brief List of dlnne subgraph passes. +PD_INFER_DECL extern const std::vector kDlnneSubgraphPasses; + /// \brief List of lite subgraph passes. 
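A minimal usage sketch of the configuration options introduced above; the model path is a placeholder, and the argument values simply restate the defaults declared in paddle_analysis_config.h:

#include "paddle/fluid/inference/api/paddle_inference_api.h"

int main() {
  paddle::AnalysisConfig config;
  config.SetModel("/path/to/model_dir");

  // XPU with the new tuning knobs.
  config.EnableXpu(0xfffc00, /*locked=*/false, /*autotune=*/true,
                   /*autotune_file=*/"", /*precision=*/"int16",
                   /*adaptive_seqlen=*/false);

  // Or, independently, DLNNE subgraph offloading with a minimum subgraph size of 3,
  // which makes the builder switch to kDlnneSubgraphPasses.
  config.EnableDlnne(/*min_subgraph_size=*/3);
  return 0;
}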
PD_INFER_DECL extern const std::vector kLiteSubgraphPasses; diff --git a/paddle/fluid/inference/capi/pd_config.cc b/paddle/fluid/inference/capi/pd_config.cc index 231639667244d..9bb52ba578025 100644 --- a/paddle/fluid/inference/capi/pd_config.cc +++ b/paddle/fluid/inference/capi/pd_config.cc @@ -260,6 +260,22 @@ bool PD_TensorrtEngineEnabled(const PD_AnalysisConfig* config) { return config->config.tensorrt_engine_enabled(); } +void PD_EnableDlnne(PD_AnalysisConfig* config, int min_subgraph_size) { + PADDLE_ENFORCE_NOT_NULL( + config, + paddle::platform::errors::InvalidArgument( + "The pointer of analysis configuration shouldn't be nullptr")); + config->config.EnableDlnne(min_subgraph_size); +} + +bool PD_DlnneEnabled(const PD_AnalysisConfig* config) { + PADDLE_ENFORCE_NOT_NULL( + config, + paddle::platform::errors::InvalidArgument( + "The pointer of analysis configuration shouldn't be nullptr")); + return config->config.dlnne_enabled(); +} + void PD_SwitchIrDebug(PD_AnalysisConfig* config, bool x) { PADDLE_ENFORCE_NOT_NULL( config, diff --git a/paddle/fluid/inference/capi_exp/CMakeLists.txt b/paddle/fluid/inference/capi_exp/CMakeLists.txt new file mode 100644 index 0000000000000..521d24329d464 --- /dev/null +++ b/paddle/fluid/inference/capi_exp/CMakeLists.txt @@ -0,0 +1,29 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +set(C_API_SRCS pd_config.cc pd_predictor.cc pd_tensor.cc pd_utils.cc) + +cc_library(paddle_inference_c SRCS ${C_API_SRCS} DEPS paddle_inference) + +if(NOT ON_INFER) + return() +endif() + +# Create inference capi shared library +cc_library(paddle_inference_c_shared SHARED SRCS ${C_API_SRCS} DEPS paddle_inference) +set_target_properties(paddle_inference_c_shared PROPERTIES OUTPUT_NAME paddle_inference_c) +if(WIN32) + target_link_libraries(paddle_inference_c_shared shlwapi.lib) +endif() diff --git a/paddle/fluid/inference/capi_exp/lod_demo.cc b/paddle/fluid/inference/capi_exp/lod_demo.cc new file mode 100644 index 0000000000000..2b049e992e71d --- /dev/null +++ b/paddle/fluid/inference/capi_exp/lod_demo.cc @@ -0,0 +1,102 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +/// +/// \file lod_demo.cc +/// +/// \brief a demo for user to learn how to inference by c api. +/// it rectify from +/// paddle/fluid/inference/tests/api/analyzer_capi_exp_ner_tester.cc. 
+/// +/// \author paddle-infer@baidu.com +/// \date 2021-04-21 +/// \since 2.1 +/// + +#include +#include +#include +#include +#include +#include "paddle/fluid/inference/capi_exp/pd_inference_api.h" + +int main(int argc, char *argv[]) { + auto model_dir = FLAGS_infer_model; + PD_Config *config = PD_ConfigCreate(); + PD_ConfigSetModel(config, (model_dir + "/__model__").c_str(), + (model_dir + "/param").c_str()); + PD_ConfigDisableGpu(config); + + PD_Predictor *predictor = PD_PredictorCreate(config); + size_t input_num = PD_PredictorGetInputNum(predictor); + size_t output_num = PD_PredictorGetOutputNum(predictor); + + PD_OneDimArrayCstr *input_names = PD_PredictorGetInputNames(predictor); + LOG(INFO) << "Predictor start run!"; + PD_Tensor *inputs[2]; + inputs[0] = PD_PredictorGetInputHandle(predictor, input_names->data[0]); + inputs[1] = PD_PredictorGetInputHandle(predictor, input_names->data[1]); + LOG(INFO) << "Predictor start run!"; + // inputs[0]: word, use lod memory in stack + int32_t shape_0[2] = {11, 1}; + int64_t data_0[11 * 1] = {12673, 9763, 905, 284, 45, 7474, 20, 17, 1, 4, 9}; + size_t lod_layer_0[2] = {0, 11}; + PD_OneDimArraySize layer_0; + layer_0.size = 2; + layer_0.data = lod_layer_0; + PD_OneDimArraySize *layer_0_ptr = &layer_0; + PD_TwoDimArraySize lod_0; + lod_0.size = 1; + lod_0.data = &layer_0_ptr; + PD_TensorReshape(inputs[0], 2, shape_0); + PD_TensorCopyFromCpuInt64(inputs[0], data_0); + PD_TensorSetLod(inputs[0], &lod_0); + + // inputs[1]: mention, use lod memory in heap + int32_t shape_1[2] = {11, 1}; + int64_t data_1[11 * 1] = {27, 0, 0, 33, 34, 33, 0, 0, 0, 1, 2}; + PD_TwoDimArraySize *lod_1_ptr = new PD_TwoDimArraySize(); + lod_1_ptr->size = 1; + lod_1_ptr->data = new PD_OneDimArraySize *[1]; + lod_1_ptr->data[0] = new PD_OneDimArraySize(); + lod_1_ptr->data[0]->size = 2; + lod_1_ptr->data[0]->data = new size_t[2]; + lod_1_ptr->data[0]->data[0] = 0; + lod_1_ptr->data[0]->data[1] = 11; + + PD_TensorReshape(inputs[1], 2, shape_1); + PD_TensorCopyFromCpuInt64(inputs[1], data_1); + PD_TensorSetLod(inputs[1], lod_1_ptr); + // retrieve the lod memory + delete[] lod_1_ptr->data[0]->data; + delete lod_1_ptr->data[0]; + delete[] lod_1_ptr->data; + delete lod_1_ptr; + lod_1_ptr = nullptr; + + PD_PredictorRun(predictor); + PD_OneDimArrayCstr *output_names = PD_PredictorGetOutputNames(predictor); + PD_Tensor *output = + PD_PredictorGetOutputHandle(predictor, output_names->data[0]); + PD_TwoDimArraySize *output_lod = PD_TensorGetLod(output); + + PD_TwoDimArraySizeDestroy(output_lod); + PD_TensorDestroy(output); + PD_OneDimArrayCstrDestroy(output_names); + + PD_TensorDestroy(inputs[0]); + PD_TensorDestroy(inputs[1]); + PD_OneDimArrayCstrDestroy(input_names); + PD_PredictorDestroy(predictor); +} diff --git a/paddle/fluid/inference/capi_exp/pd_common.h b/paddle/fluid/inference/capi_exp/pd_common.h new file mode 100644 index 0000000000000..4b70ed7fbad29 --- /dev/null +++ b/paddle/fluid/inference/capi_exp/pd_common.h @@ -0,0 +1,75 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
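On the LoD layout used by the demo above: the level-0 entries are cumulative sequence offsets, so a single 11-token sequence is {0, 11}, and a batch of two sequences of lengths 4 and 7 would be {0, 4, 11} with the same {11, 1} shape. A sketch for the two-sequence case; the tensor handle is assumed to be obtained as in the demo:

#include "paddle/fluid/inference/capi_exp/pd_inference_api.h"

// word_tensor is assumed to be a valid input handle from
// PD_PredictorGetInputHandle, as in lod_demo.cc.
void SetTwoSequenceLod(PD_Tensor* word_tensor) {
  static size_t offsets[3] = {0, 4, 11};  // cumulative lengths: 4 and 7 tokens
  PD_OneDimArraySize level0;
  level0.size = 3;
  level0.data = offsets;
  PD_OneDimArraySize* level0_ptr = &level0;
  PD_TwoDimArraySize lod;
  lod.size = 1;  // one LoD level
  lod.data = &level0_ptr;
  PD_TensorSetLod(word_tensor, &lod);
}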
+// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include + +#if defined(_WIN32) +#ifdef PADDLE_DLL_INFERENCE +#define PADDLE_CAPI_EXPORT __declspec(dllexport) +#else +#define PADDLE_CAPI_EXPORT __declspec(dllimport) +#endif // PADDLE_DLL_INFERENCE +#else +#define PADDLE_CAPI_EXPORT __attribute__((visibility("default"))) +#endif // _WIN32 + +/// +/// __pd_give means that a new object is returned. The user should make sure +/// that the returned pointer is used exactly once as a value for an __pd_take +/// argument. In between, it can be used as a value for as many __pd_keep +/// arguments as the user likes. +/// +#ifndef __pd_give +#define __pd_give +#endif +/// +/// __pd_take means that the object the argument points to is taken over by the +/// function and may no longer be used by the user as an argument to any other +/// function. The pointer value must be one returned by a function returning an +/// __pd_give pointer. +/// +#ifndef __pd_take +#define __pd_take +#endif +/// +/// __pd_keep means that the function will only use the object temporarily. The +/// object which the argument points to is not taken over by the function. After +/// the function has finished, the user can still use it as an argument to other +/// functions. +/// +#ifndef __pd_keep +#define __pd_keep +#endif + +typedef int8_t PD_Bool; +#define TRUE 1 +#define FALSE 0 + +#define PD_ENUM(type) \ + typedef int32_t type; \ + enum + +PD_ENUM(PD_PrecisionType){PD_PRECISION_FLOAT32 = 0, PD_PRECISION_INT8, + PD_PRECISION_HALF}; + +PD_ENUM(PD_PlaceType){PD_PLACE_UNK = -1, PD_PLACE_CPU, PD_PLACE_GPU, + PD_PLACE_XPU}; + +PD_ENUM(PD_DataType){ + PD_DATA_UNK = -1, PD_DATA_FLOAT32, PD_DATA_INT32, + PD_DATA_INT64, PD_DATA_UINT8, +}; diff --git a/paddle/fluid/inference/capi_exp/pd_config.cc b/paddle/fluid/inference/capi_exp/pd_config.cc new file mode 100644 index 0000000000000..c45454e86bdaa --- /dev/null +++ b/paddle/fluid/inference/capi_exp/pd_config.cc @@ -0,0 +1,382 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
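For reference, the PD_ENUM macro above expands PD_ENUM(PD_PlaceType){...}; into a fixed-width typedef plus an anonymous enum, which keeps the C API's enum values at a stable 32-bit width across the library boundary:

#include <cstdint>

// Expansion of PD_ENUM(PD_PlaceType){PD_PLACE_UNK = -1, PD_PLACE_CPU,
// PD_PLACE_GPU, PD_PLACE_XPU};
typedef int32_t PD_PlaceType;
enum { PD_PLACE_UNK = -1, PD_PLACE_CPU, PD_PLACE_GPU, PD_PLACE_XPU };

int main() {
  PD_PlaceType place = PD_PLACE_GPU;  // stored as a plain int32_t with value 1
  return place == PD_PLACE_GPU ? 0 : 1;
}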
+ +#include "paddle/fluid/inference/capi_exp/pd_config.h" +#include "paddle/fluid/inference/api/paddle_inference_api.h" +#include "paddle/fluid/platform/enforce.h" + +#define CHECK_NULL_POINTER_PARM(param) \ + PADDLE_ENFORCE_NOT_NULL( \ + param, paddle::platform::errors::InvalidArgument( \ + "The pointer of " #param " shouldn't be nullptr")) + +#define CHECK_AND_CONVERT_PD_CONFIG \ + PADDLE_ENFORCE_NOT_NULL( \ + pd_config, paddle::platform::errors::InvalidArgument( \ + "The pointer of paddle config shouldn't be nullptr")); \ + Config* config = reinterpret_cast(pd_config) + +using paddle_infer::Config; + +static Config::Precision ConvertToCxxPrecisionType(PD_PrecisionType precision) { + switch (precision) { + case PD_PRECISION_FLOAT32: + return Config::Precision::kFloat32; + case PD_PRECISION_INT8: + return Config::Precision::kInt8; + case PD_PRECISION_HALF: + return Config::Precision::kHalf; + default: + PADDLE_THROW(paddle::platform::errors::InvalidArgument( + "Unsupport paddle precision type %d.", precision)); + return Config::Precision::kFloat32; + } +} + +extern "C" { +__pd_give PD_Config* PD_ConfigCreate() { + return reinterpret_cast(new Config()); +} + +void PD_ConfigDestroy(__pd_take PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + delete reinterpret_cast(config); +} + +void PD_ConfigSetModel(__pd_keep PD_Config* pd_config, + const char* prog_file_path, + const char* params_file_path) { + CHECK_AND_CONVERT_PD_CONFIG; + CHECK_NULL_POINTER_PARM(prog_file_path); + CHECK_NULL_POINTER_PARM(params_file_path); + config->SetModel(prog_file_path, params_file_path); +} +void PD_ConfigSetProgFile(__pd_keep PD_Config* pd_config, + const char* prog_file_path) { + CHECK_AND_CONVERT_PD_CONFIG; + CHECK_NULL_POINTER_PARM(prog_file_path); + config->SetProgFile(prog_file_path); +} +void PD_ConfigSetParamsFile(__pd_keep PD_Config* pd_config, + const char* params_file_path) { + CHECK_AND_CONVERT_PD_CONFIG; + CHECK_NULL_POINTER_PARM(params_file_path); + config->SetParamsFile(params_file_path); +} +void PD_ConfigSetOptimCacheDir(__pd_keep PD_Config* pd_config, + const char* opt_cache_dir) { + CHECK_AND_CONVERT_PD_CONFIG; + CHECK_NULL_POINTER_PARM(opt_cache_dir); + config->SetOptimCacheDir(opt_cache_dir); +} + +void PD_ConfigSetModelDir(__pd_keep PD_Config* pd_config, + const char* model_dir) { + CHECK_AND_CONVERT_PD_CONFIG; + CHECK_NULL_POINTER_PARM(model_dir); + config->SetModel(model_dir); +} +const char* PD_ConfigGetModelDir(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + return config->model_dir().c_str(); +} +const char* PD_ConfigGetProgFile(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + return config->prog_file().c_str(); +} +const char* PD_ConfigGetParamsFile(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + return config->params_file().c_str(); +} + +void PD_ConfigDisableFCPadding(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + config->DisableFCPadding(); +} +PD_Bool PD_ConfigUseFcPadding(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + return config->use_fc_padding(); +} + +void PD_ConfigEnableUseGpu(__pd_keep PD_Config* pd_config, + uint64_t memory_pool_init_size_mb, + int32_t device_id) { + CHECK_AND_CONVERT_PD_CONFIG; + config->EnableUseGpu(memory_pool_init_size_mb, device_id); +} +void PD_ConfigDisableGpu(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + config->DisableGpu(); +} +PD_Bool PD_ConfigUseGpu(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + return 
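A lifecycle sketch following the __pd_give/__pd_take annotations above, assuming (as lod_demo.cc suggests, since it never destroys its config) that PD_PredictorCreate takes ownership of the config; the model paths are placeholders:

#include "paddle/fluid/inference/capi_exp/pd_inference_api.h"

int main() {
  // PD_ConfigCreate is __pd_give: the returned pointer must end up in exactly
  // one __pd_take consumer.
  PD_Config* config = PD_ConfigCreate();
  PD_ConfigSetModel(config, "/path/to/__model__", "/path/to/params");
  PD_ConfigDisableGpu(config);

  // Assumed __pd_take consumer: the predictor owns the config from here on, so
  // PD_ConfigDestroy is not called. A config that is never handed off would
  // instead need PD_ConfigDestroy.
  PD_Predictor* predictor = PD_PredictorCreate(config);
  PD_PredictorDestroy(predictor);
  return 0;
}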
config->use_gpu(); +} + +void PD_ConfigEnableXpu(__pd_keep PD_Config* pd_config, + int32_t l3_workspace_size) { + CHECK_AND_CONVERT_PD_CONFIG; + config->EnableXpu(l3_workspace_size); +} +PD_Bool PD_ConfigUseXpu(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + return config->use_xpu(); +} + +int32_t PD_ConfigGpuDeviceId(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + return config->gpu_device_id(); +} +int32_t PD_ConfigXpuDeviceId(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + return config->xpu_device_id(); +} +int32_t PD_ConfigMemoryPoolInitSizeMb(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + return config->memory_pool_init_size_mb(); +} +float PD_ConfigFractionOfGpuMemoryForPool(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + return config->fraction_of_gpu_memory_for_pool(); +} +void PD_ConfigEnableCudnn(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + config->EnableCUDNN(); +} +PD_Bool PD_ConfigCudnnEnabled(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + return config->cudnn_enabled(); +} + +void PD_ConfigSwitchIrOptim(__pd_keep PD_Config* pd_config, PD_Bool x) { + CHECK_AND_CONVERT_PD_CONFIG; + config->SwitchIrOptim(x); +} +PD_Bool PD_ConfigIrOptim(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + return config->ir_optim(); +} + +void PD_ConfigEnableTensorRtEngine(__pd_keep PD_Config* pd_config, + int32_t workspace_size, + int32_t max_batch_size, + int32_t min_subgraph_size, + PD_PrecisionType precision, + PD_Bool use_static, PD_Bool use_calib_mode) { + CHECK_AND_CONVERT_PD_CONFIG; + config->EnableTensorRtEngine( + workspace_size, max_batch_size, min_subgraph_size, + ConvertToCxxPrecisionType(precision), use_static, use_calib_mode); +} +PD_Bool PD_ConfigTensorRtEngineEnabled(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + return config->tensorrt_engine_enabled(); +} + +void PD_ConfigSetTrtDynamicShapeInfo(__pd_keep PD_Config* pd_config, + size_t tensor_num, + const char** tensor_name, + size_t* shapes_num, int32_t** min_shape, + int32_t** max_shape, int32_t** optim_shape, + PD_Bool disable_trt_plugin_fp16) { + CHECK_AND_CONVERT_PD_CONFIG; + std::map> min_input_shapes; + std::map> max_input_shapes; + std::map> optim_input_shapes; + for (size_t tensor_index = 0; tensor_index < tensor_num; ++tensor_index) { + std::string name(tensor_name[tensor_index]); + std::vector min_input_shape, max_input_shape, optim_input_shape; + for (size_t shape_index = 0; shape_index < shapes_num[tensor_index]; + ++shape_index) { + min_input_shape.emplace_back(min_shape[tensor_index][shape_index]); + max_input_shape.emplace_back(max_shape[tensor_index][shape_index]); + optim_input_shape.emplace_back(optim_shape[tensor_index][shape_index]); + } + min_input_shapes[name] = std::move(min_input_shape); + max_input_shapes[name] = std::move(max_input_shape); + optim_input_shapes[name] = std::move(optim_input_shape); + } + config->SetTRTDynamicShapeInfo(min_input_shapes, max_input_shapes, + optim_input_shapes, disable_trt_plugin_fp16); +} + +void PD_ConfigDisableTensorRtOPs(__pd_keep PD_Config* pd_config, size_t ops_num, + const char** ops_name) { + CHECK_AND_CONVERT_PD_CONFIG; + std::vector ops_list; + for (size_t index = 0; index < ops_num; ++index) { + ops_list.emplace_back(ops_name[index]); + } + config->Exp_DisableTensorRtOPs(ops_list); +} + +void PD_ConfigEnableTensorRtOSS(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + 
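A usage sketch for PD_ConfigSetTrtDynamicShapeInfo above, covering a single NCHW input with a dynamic batch dimension; the tensor name and shapes are illustrative:

#include "paddle/fluid/inference/capi_exp/pd_inference_api.h"

void SetDynamicShape(PD_Config* config) {
  const char* names[1] = {"image"};
  size_t shapes_num[1] = {4};                   // 4 dims per input
  int32_t min_data[4] = {1, 3, 224, 224};       // min batch = 1
  int32_t max_data[4] = {8, 3, 224, 224};       // max batch = 8
  int32_t opt_data[4] = {4, 3, 224, 224};       // optimized batch = 4
  int32_t* min_shape[1] = {min_data};
  int32_t* max_shape[1] = {max_data};
  int32_t* opt_shape[1] = {opt_data};
  PD_ConfigSetTrtDynamicShapeInfo(config, 1, names, shapes_num, min_shape,
                                  max_shape, opt_shape,
                                  /*disable_trt_plugin_fp16=*/FALSE);
}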
config->EnableTensorRtOSS(); +} +PD_Bool PD_ConfigTensorRtOssEnabled(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + return config->tensorrt_oss_enabled(); +} + +void PD_ConfigEnableTensorRtDla(__pd_keep PD_Config* pd_config, + int32_t dla_core) { + CHECK_AND_CONVERT_PD_CONFIG; + config->EnableTensorRtDLA(dla_core); +} +PD_Bool PD_ConfigTensorRtDlaEnabled(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + return config->tensorrt_dla_enabled(); +} + +void PD_ConfigEnableLiteEngine(__pd_keep PD_Config* pd_config, + PD_PrecisionType precision, PD_Bool zero_copy, + size_t passes_filter_num, + const char** passes_filter, + size_t ops_filter_num, const char** ops_filter) { + CHECK_AND_CONVERT_PD_CONFIG; + std::vector passes_filters, ops_filters; + for (size_t index = 0; index < passes_filter_num; ++index) { + passes_filters.emplace_back(passes_filter[index]); + } + for (size_t index = 0; index < ops_filter_num; ++index) { + ops_filters.emplace_back(ops_filter[index]); + } + config->EnableLiteEngine(ConvertToCxxPrecisionType(precision), zero_copy, + passes_filters, ops_filters); +} +PD_Bool PD_ConfigLiteEngineEnabled(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + return config->lite_engine_enabled(); +} + +void PD_ConfigSwitchIrDebug(__pd_keep PD_Config* pd_config, PD_Bool x) { + CHECK_AND_CONVERT_PD_CONFIG; + config->SwitchIrDebug(x); +} +void PD_ConfigEnableMKLDNN(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + config->EnableMKLDNN(); +} +void PD_ConfigSetMkldnnCacheCapacity(__pd_keep PD_Config* pd_config, + int32_t capacity) { + CHECK_AND_CONVERT_PD_CONFIG; + config->SetMkldnnCacheCapacity(capacity); +} +PD_Bool PD_ConfigMkldnnEnabled(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + return config->mkldnn_enabled(); +} +void PD_ConfigSetCpuMathLibraryNumThreads( + __pd_keep PD_Config* pd_config, int32_t cpu_math_library_num_threads) { + CHECK_AND_CONVERT_PD_CONFIG; + config->SetCpuMathLibraryNumThreads(cpu_math_library_num_threads); +} +int32_t PD_ConfigGetCpuMathLibraryNumThreads(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + return config->cpu_math_library_num_threads(); +} + +void PD_ConfigSetMkldnnOp(__pd_keep PD_Config* pd_config, size_t ops_num, + const char** op_list) { + CHECK_AND_CONVERT_PD_CONFIG; + std::unordered_set op_names; + for (size_t index = 0; index < ops_num; ++index) { + op_names.emplace(op_list[index]); + } + config->SetMKLDNNOp(std::move(op_names)); +} +void PD_ConfigEnableMkldnnQuantizer(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + config->EnableMkldnnQuantizer(); +} +void PD_ConfigEnableMkldnnBfloat16(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + config->EnableMkldnnBfloat16(); +} +PD_Bool PD_ConfigMkldnnBfloat16Enabled(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + return config->mkldnn_bfloat16_enabled(); +} +void PD_ConfigSetBfloat16Op(__pd_keep PD_Config* pd_config, size_t ops_num, + const char** op_list) { + CHECK_AND_CONVERT_PD_CONFIG; + std::unordered_set op_names; + for (size_t index = 0; index < ops_num; ++index) { + op_names.emplace(op_list[index]); + } + config->SetBfloat16Op(std::move(op_names)); +} +PD_Bool PD_ConfigThreadLocalStreamEnabled(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + return config->thread_local_stream_enabled(); +} +PD_Bool PD_ConfigMkldnnQuantizerEnabled(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + return 
config->mkldnn_quantizer_enabled(); +} +void PD_ConfigSetModelBuffer(__pd_keep PD_Config* pd_config, + const char* prog_buffer, size_t prog_buffer_size, + const char* params_buffer, + size_t params_buffer_size) { + CHECK_AND_CONVERT_PD_CONFIG; + config->SetModelBuffer(prog_buffer, prog_buffer_size, params_buffer, + params_buffer_size); +} +PD_Bool PD_ConfigModelFromMemory(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + return config->model_from_memory(); +} +void PD_ConfigEnableMemoryOptim(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + config->EnableMemoryOptim(); +} +PD_Bool PD_ConfigMemoryOptimEnabled(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + return config->enable_memory_optim(); +} +void PD_ConfigEnableProfile(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + config->EnableProfile(); +} +PD_Bool PD_ConfigProfileEnabled(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + return config->profile_enabled(); +} +void PD_ConfigDisableGlogInfo(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + config->DisableGlogInfo(); +} +PD_Bool PD_ConfigGlogInfoDisabled(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + return config->glog_info_disabled(); +} +void PD_ConfigSetInvalid(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + config->SetInValid(); +} +PD_Bool PD_ConfigIsValid(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + return config->is_valid(); +} +void PD_ConfigEnableGpuMultiStream(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + config->EnableGpuMultiStream(); +} +void PD_ConfigPartiallyRelease(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + config->PartiallyRelease(); +} + +} // extern "C" diff --git a/paddle/fluid/inference/capi_exp/pd_config.h b/paddle/fluid/inference/capi_exp/pd_config.h new file mode 100644 index 0000000000000..e44983e24484e --- /dev/null +++ b/paddle/fluid/inference/capi_exp/pd_config.h @@ -0,0 +1,571 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +/// +/// \file pd_config.h +/// +/// \brief interface for paddle config +/// +/// \author paddle-infer@baidu.com +/// \date 2021-04-21 +/// \since 2.1 +/// + +#pragma once + +#include "pd_common.h" // NOLINT + +typedef struct PD_Config PD_Config; + +#ifdef __cplusplus +extern "C" { +#endif + +/// +/// \brief Create a paddle config +/// +/// \return new config. +/// +PADDLE_CAPI_EXPORT extern __pd_give PD_Config* PD_ConfigCreate(); +/// +/// \brief Destroy the paddle config +/// +/// \param[in] pd_onfig config +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigDestroy(__pd_take PD_Config* pd_config); +/// +/// \brief Set the combined model with two specific pathes for program and +/// parameters. +/// +/// \param[in] pd_onfig config +/// \param[in] prog_file_path model file path of the combined model. 
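A sketch of loading a model from memory through PD_ConfigSetModelBuffer above; the file paths are placeholders, and the buffers are kept alive for the program's lifetime to stay conservative about buffer ownership:

#include <fstream>
#include <sstream>
#include <string>
#include "paddle/fluid/inference/capi_exp/pd_inference_api.h"

static std::string ReadFile(const std::string& path) {
  std::ifstream in(path, std::ios::binary);
  std::ostringstream buf;
  buf << in.rdbuf();
  return buf.str();
}

void ConfigFromMemory(PD_Config* config) {
  // Static so the raw buffers outlive this call regardless of whether the
  // config copies them.
  static std::string prog = ReadFile("/path/to/__model__");
  static std::string params = ReadFile("/path/to/params");
  PD_ConfigSetModelBuffer(config, prog.data(), prog.size(), params.data(),
                          params.size());
  // PD_ConfigModelFromMemory(config) should now report TRUE.
}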
+/// \param[in] params_file_path params file path of the combined model. +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigSetModel(__pd_keep PD_Config* pd_config, + const char* prog_file_path, + const char* params_file_path); +/// +/// \brief Set the model file path of a combined model. +/// +/// \param[in] pd_onfig config +/// \param[in] prog_file_path model file path. +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigSetProgFile( + __pd_keep PD_Config* pd_config, const char* prog_file_path); +/// +/// \brief Set the params file path of a combined model. +/// +/// \param[in] pd_onfig config +/// \param[in] params_file_path params file path. +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigSetParamsFile( + __pd_keep PD_Config* pd_config, const char* params_file_path); +/// +/// \brief Set the path of optimization cache directory. +/// \param[in] pd_onfig config +/// \param[in] opt_cache_dir the path of optimization cache directory. +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigSetOptimCacheDir( + __pd_keep PD_Config* pd_config, const char* opt_cache_dir); +/// +/// \brief Set the no-combined model dir path. +/// \param[in] pd_onfig config +/// \param[in] model_dir model dir path. +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigSetModelDir( + __pd_keep PD_Config* pd_config, const char* model_dir); +/// +/// \brief Get the model directory path. +/// +/// \param[in] pd_onfig config +/// \return The model directory path. +/// +PADDLE_CAPI_EXPORT extern const char* PD_ConfigGetModelDir( + __pd_keep PD_Config* pd_config); +/// +/// \brief Get the program file path. +/// +/// \param[in] pd_onfig config +/// \return The program file path. +/// +PADDLE_CAPI_EXPORT extern const char* PD_ConfigGetProgFile( + __pd_keep PD_Config* pd_config); +/// +/// \brief Get the params file path. +/// +/// \param[in] pd_onfig config +/// \return The params file path. +/// +PADDLE_CAPI_EXPORT extern const char* PD_ConfigGetParamsFile( + __pd_keep PD_Config* pd_config); +/// +/// \brief Turn off FC Padding. +/// +/// \param[in] pd_onfig config +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigDisableFCPadding( + __pd_keep PD_Config* pd_config); +/// +/// \brief A boolean state telling whether fc padding is used. +/// +/// \param[in] pd_onfig config +/// \return Whether fc padding is used. +/// +PADDLE_CAPI_EXPORT extern PD_Bool PD_ConfigUseFcPadding( + __pd_keep PD_Config* pd_config); +/// +/// \brief Turn on GPU. +/// +/// \param[in] pd_onfig config +/// \param[in] memory_pool_init_size_mb initial size of the GPU memory pool in +/// MB. +/// \param[in] device_id device_id the GPU card to use. +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigEnableUseGpu( + __pd_keep PD_Config* pd_config, uint64_t memory_pool_init_size_mb, + int32_t device_id); +/// +/// \brief Turn off GPU. +/// +/// \param[in] pd_onfig config +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigDisableGpu( + __pd_keep PD_Config* pd_config); +/// +/// \brief A boolean state telling whether the GPU is turned on. +/// +/// \brief Turn off GPU. +/// \return Whether the GPU is turned on. +/// +PADDLE_CAPI_EXPORT extern PD_Bool PD_ConfigUseGpu( + __pd_keep PD_Config* pd_config); +/// +/// \brief Turn on XPU. +/// +/// \param[in] pd_onfig config +/// \param[in] l3_workspace_size l3 workspace size. +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigEnableXpu( + __pd_keep PD_Config* pd_config, int32_t l3_workspace_size); +/// +/// \brief A boolean state telling whether the XPU is turned on. +/// +/// \param[in] pd_onfig config +/// \return Whether the XPU is turned on. 
+///
+PADDLE_CAPI_EXPORT extern PD_Bool PD_ConfigUseXpu(
+ __pd_keep PD_Config* pd_config);
+///
+/// \brief Get the GPU device id.
+///
+/// \param[in] pd_config config
+/// \return The GPU device id.
+///
+PADDLE_CAPI_EXPORT extern int32_t PD_ConfigGpuDeviceId(
+ __pd_keep PD_Config* pd_config);
+///
+/// \brief Get the XPU device id.
+///
+/// \param[in] pd_config config
+/// \return The XPU device id.
+///
+PADDLE_CAPI_EXPORT extern int32_t PD_ConfigXpuDeviceId(
+ __pd_keep PD_Config* pd_config);
+///
+/// \brief Get the initial size in MB of the GPU memory pool.
+///
+/// \param[in] pd_config config
+/// \return The initial size in MB of the GPU memory pool.
+///
+PADDLE_CAPI_EXPORT extern int32_t PD_ConfigMemoryPoolInitSizeMb(
+ __pd_keep PD_Config* pd_config);
+///
+/// \brief Get the proportion of the initial memory pool size compared to the
+/// device.
+///
+/// \param[in] pd_config config
+/// \return The proportion of the initial memory pool size.
+///
+PADDLE_CAPI_EXPORT extern float PD_ConfigFractionOfGpuMemoryForPool(
+ __pd_keep PD_Config* pd_config);
+///
+/// \brief Turn on CUDNN.
+///
+/// \param[in] pd_config config
+///
+PADDLE_CAPI_EXPORT extern void PD_ConfigEnableCudnn(
+ __pd_keep PD_Config* pd_config);
+///
+/// \brief A boolean state telling whether to use CUDNN.
+///
+/// \param[in] pd_config config
+/// \return Whether to use CUDNN.
+///
+PADDLE_CAPI_EXPORT extern PD_Bool PD_ConfigCudnnEnabled(
+ __pd_keep PD_Config* pd_config);
+///
+/// \brief Control whether to perform IR graph optimization.
+/// If turned off, the AnalysisConfig will act just like a NativeConfig.
+///
+/// \param[in] pd_config config
+/// \param[in] x Whether the ir graph optimization is activated.
+///
+PADDLE_CAPI_EXPORT extern void PD_ConfigSwitchIrOptim(
+ __pd_keep PD_Config* pd_config, PD_Bool x);
+///
+/// \brief A boolean state telling whether the ir graph optimization is
+/// activated.
+///
+/// \param[in] pd_config config
+/// \return Whether to use ir graph optimization.
+///
+PADDLE_CAPI_EXPORT extern PD_Bool PD_ConfigIrOptim(
+ __pd_keep PD_Config* pd_config);
+///
+/// \brief Turn on the TensorRT engine.
+/// The TensorRT engine will accelerate some subgraphs in the original Fluid
+/// computation graph. In some models such as resnet50, GoogleNet and so on,
+/// it gains significant performance acceleration.
+///
+/// \param[in] pd_config config
+/// \param[in] workspace_size The memory size(in byte) used for TensorRT
+/// workspace.
+/// \param[in] max_batch_size The maximum batch size of this prediction task,
+/// better set as small as possible for less performance loss.
+/// \param[in] min_subgraph_size The minimum TensorRT subgraph size needed, if a
+/// subgraph is smaller than this, it will not be transferred to TensorRT
+/// engine.
+/// \param[in] precision The precision used in TensorRT.
+/// \param[in] use_static Serialize optimization information to disk for
+/// reusing.
+/// \param[in] use_calib_mode Use TRT int8 calibration(post training
+/// quantization).
+///
+PADDLE_CAPI_EXPORT extern void PD_ConfigEnableTensorRtEngine(
+ __pd_keep PD_Config* pd_config, int32_t workspace_size,
+ int32_t max_batch_size, int32_t min_subgraph_size,
+ PD_PrecisionType precision, PD_Bool use_static, PD_Bool use_calib_mode);
+///
+/// \brief A boolean state telling whether the TensorRT engine is used.
+///
+/// \param[in] pd_config config
+/// \return Whether the TensorRT engine is used.
+///
+PADDLE_CAPI_EXPORT extern PD_Bool PD_ConfigTensorRtEngineEnabled(
+ __pd_keep PD_Config* pd_config);
+///
+/// \brief Set min, max, opt shape for TensorRT Dynamic shape mode.
+///
+/// \param[in] pd_config config
+/// \param[in] tensor_num The number of the subgraph input.
+/// \param[in] tensor_name The name of every subgraph input.
+/// \param[in] shapes_num The shape size of every subgraph input.
+/// \param[in] min_shape The min input shape of every subgraph input.
+/// \param[in] max_shape The max input shape of every subgraph input.
+/// \param[in] optim_shape The opt input shape of every subgraph input.
+/// \param[in] disable_trt_plugin_fp16 Setting this parameter to true means that
+/// TRT plugin will not run fp16.
+///
+PADDLE_CAPI_EXPORT extern void PD_ConfigSetTrtDynamicShapeInfo(
+ __pd_keep PD_Config* pd_config, size_t tensor_num, const char** tensor_name,
+ size_t* shapes_num, int32_t** min_shape, int32_t** max_shape,
+ int32_t** optim_shape, PD_Bool disable_trt_plugin_fp16);
+///
+/// \brief Prevent ops running in Paddle-TRT
+/// NOTE: just experimental, not an official stable API, easy to be broken.
+///
+/// \param[in] pd_config config
+/// \param[in] ops_num ops number
+/// \param[in] ops_name ops name
+///
+PADDLE_CAPI_EXPORT extern void PD_ConfigDisableTensorRtOPs(
+ __pd_keep PD_Config* pd_config, size_t ops_num, const char** ops_name);
+///
+/// \brief Replace some TensorRT plugins with TensorRT OSS(
+/// https://github.com/NVIDIA/TensorRT), with which some models' inference
+/// may achieve higher performance. Libnvinfer_plugin.so greater than
+/// V7.2.1 is needed.
+///
+/// \param[in] pd_config config
+///
+PADDLE_CAPI_EXPORT extern void PD_ConfigEnableTensorRtOSS(
+ __pd_keep PD_Config* pd_config);
+///
+/// \brief A boolean state telling whether to use the TensorRT OSS.
+///
+/// \param[in] pd_config config
+/// \return Whether to use the TensorRT OSS.
+///
+PADDLE_CAPI_EXPORT extern PD_Bool PD_ConfigTensorRtOssEnabled(
+ __pd_keep PD_Config* pd_config);
+///
+/// \brief Enable TensorRT DLA
+///
+/// \param[in] pd_config config
+/// \param[in] dla_core ID of DLACore, which should be 0, 1,
+/// ..., IBuilder.getNbDLACores() - 1
+///
+PADDLE_CAPI_EXPORT extern void PD_ConfigEnableTensorRtDla(
+ __pd_keep PD_Config* pd_config, int32_t dla_core);
+///
+/// \brief A boolean state telling whether to use the TensorRT DLA.
+///
+/// \param[in] pd_config config
+/// \return Whether to use the TensorRT DLA.
+///
+PADDLE_CAPI_EXPORT extern PD_Bool PD_ConfigTensorRtDlaEnabled(
+ __pd_keep PD_Config* pd_config);
+///
+/// \brief Turn on the usage of Lite sub-graph engine.
+///
+/// \param[in] pd_config config
+/// \param[in] precision Precision used in Lite sub-graph engine.
+/// \param[in] zero_copy whether to use zero copy.
+/// \param[in] passes_filter_num The number of passes used in Lite sub-graph
+/// engine.
+/// \param[in] passes_filter The name of passes used in Lite sub-graph engine.
+/// \param[in] ops_filter_num The number of operators not supported by Lite.
+/// \param[in] ops_filter The name of operators not supported by Lite.
+///
+PADDLE_CAPI_EXPORT extern void PD_ConfigEnableLiteEngine(
+ __pd_keep PD_Config* pd_config, PD_PrecisionType precision,
+ PD_Bool zero_copy, size_t passes_filter_num, const char** passes_filter,
+ size_t ops_filter_num, const char** ops_filter);
+///
+/// \brief A boolean state indicating whether the Lite sub-graph engine is
+/// used.
+///
+/// \param[in] pd_config config
+/// \return Whether the Lite sub-graph engine is used.
+///
+PADDLE_CAPI_EXPORT extern PD_Bool PD_ConfigLiteEngineEnabled(
+ __pd_keep PD_Config* pd_config);
+///
+/// \brief Control whether to debug IR graph analysis phase.
+/// This will generate DOT files for visualizing the computation graph after
+/// each analysis pass applied.
+///
+/// \param[in] pd_config config
+/// \param[in] x whether to debug IR graph analysis phase.
+///
+PADDLE_CAPI_EXPORT extern void PD_ConfigSwitchIrDebug(
+ __pd_keep PD_Config* pd_config, PD_Bool x);
+///
+/// \brief Turn on MKLDNN.
+///
+/// \param[in] pd_config config
+///
+PADDLE_CAPI_EXPORT extern void PD_ConfigEnableMKLDNN(
+ __pd_keep PD_Config* pd_config);
+///
+/// \brief Set the cache capacity of different input shapes for MKLDNN.
+/// Default value 0 means not caching any shape.
+/// Please see MKL-DNN Data Caching Design Document:
+/// https://github.com/PaddlePaddle/FluidDoc/blob/develop/doc/fluid/design/mkldnn/caching/caching.md
+///
+/// \param[in] pd_config config
+/// \param[in] capacity The cache capacity.
+///
+PADDLE_CAPI_EXPORT extern void PD_ConfigSetMkldnnCacheCapacity(
+ __pd_keep PD_Config* pd_config, int32_t capacity);
+///
+/// \brief A boolean state telling whether to use the MKLDNN.
+///
+/// \param[in] pd_config config
+/// \return Whether to use the MKLDNN.
+///
+PADDLE_CAPI_EXPORT extern PD_Bool PD_ConfigMkldnnEnabled(
+ __pd_keep PD_Config* pd_config);
+///
+/// \brief Set the number of cpu math library threads.
+///
+/// \param[in] pd_config config
+/// \param cpu_math_library_num_threads The number of cpu math library
+/// threads.
+///
+PADDLE_CAPI_EXPORT extern void PD_ConfigSetCpuMathLibraryNumThreads(
+ __pd_keep PD_Config* pd_config, int32_t cpu_math_library_num_threads);
+///
+/// \brief An int state telling how many threads are used in the CPU math
+/// library.
+///
+/// \param[in] pd_config config
+/// \return The number of threads used in the CPU math library.
+///
+PADDLE_CAPI_EXPORT extern int32_t PD_ConfigGetCpuMathLibraryNumThreads(
+ __pd_keep PD_Config* pd_config);
+///
+/// \brief Specify the operator type list to use MKLDNN acceleration.
+///
+/// \param[in] pd_config config
+/// \param[in] ops_num The number of operator type list.
+/// \param[in] op_list The name of operator type list.
+///
+PADDLE_CAPI_EXPORT extern void PD_ConfigSetMkldnnOp(
+ __pd_keep PD_Config* pd_config, size_t ops_num, const char** op_list);
+///
+/// \brief Turn on MKLDNN quantization.
+///
+/// \param[in] pd_config config
+///
+PADDLE_CAPI_EXPORT extern void PD_ConfigEnableMkldnnQuantizer(
+ __pd_keep PD_Config* pd_config);
+///
+/// \brief A boolean state telling whether the MKLDNN quantization is enabled.
+///
+/// \param[in] pd_config config
+/// \return Whether the MKLDNN quantization is enabled.
+///
+PADDLE_CAPI_EXPORT extern PD_Bool PD_ConfigMkldnnQuantizerEnabled(
+ __pd_keep PD_Config* pd_config);
+///
+/// \brief Turn on MKLDNN bfloat16.
+///
+/// \param[in] pd_config config
+///
+PADDLE_CAPI_EXPORT extern void PD_ConfigEnableMkldnnBfloat16(
+ __pd_keep PD_Config* pd_config);
+///
+/// \brief A boolean state telling whether to use the MKLDNN Bfloat16.
+///
+/// \param[in] pd_config config
+/// \return Whether to use the MKLDNN Bfloat16.
+///
+PADDLE_CAPI_EXPORT extern PD_Bool PD_ConfigMkldnnBfloat16Enabled(
+ __pd_keep PD_Config* pd_config);
+/// \brief Specify the operator type list to use Bfloat16 acceleration.
+///
+/// \param[in] pd_config config
+/// \param[in] ops_num The number of operator type list.
+/// \param[in] op_list The name of operator type list.
+///
+PADDLE_CAPI_EXPORT extern void PD_ConfigSetBfloat16Op(
+ __pd_keep PD_Config* pd_config, size_t ops_num, const char** op_list);
+///
+/// \brief Enable the GPU multi-computing stream feature.
+/// NOTE: The current behavior of this interface is to bind the computation
+/// stream to the thread, and this behavior may be changed in the future.
+///
+/// \param[in] pd_config config
+///
+PADDLE_CAPI_EXPORT extern void PD_ConfigEnableGpuMultiStream(
+ __pd_keep PD_Config* pd_config);
+///
+/// \brief A boolean state telling whether the thread local CUDA stream is
+/// enabled.
+///
+/// \param[in] pd_config config
+/// \return Whether the thread local CUDA stream is enabled.
+///
+PADDLE_CAPI_EXPORT extern PD_Bool PD_ConfigThreadLocalStreamEnabled(
+ __pd_keep PD_Config* pd_config);
+///
+/// \brief Specify the memory buffer of program and parameter.
+/// Used when model and params are loaded directly from memory.
+///
+/// \param[in] pd_config config
+/// \param[in] prog_buffer The memory buffer of program.
+/// \param[in] prog_buffer_size The size of the model data.
+/// \param[in] params_buffer The memory buffer of the combined parameters file.
+/// \param[in] params_buffer_size The size of the combined parameters data.
+///
+PADDLE_CAPI_EXPORT extern void PD_ConfigSetModelBuffer(
+ __pd_keep PD_Config* pd_config, const char* prog_buffer,
+ size_t prog_buffer_size, const char* params_buffer,
+ size_t params_buffer_size);
+///
+/// \brief A boolean state telling whether the model is set from the CPU
+/// memory.
+///
+/// \param[in] pd_config config
+/// \return Whether model and params are loaded directly from memory.
+///
+PADDLE_CAPI_EXPORT extern PD_Bool PD_ConfigModelFromMemory(
+ __pd_keep PD_Config* pd_config);
+///
+/// \brief Turn on memory optimization.
+/// NOTE still in development.
+///
+/// \param[in] pd_config config
+///
+PADDLE_CAPI_EXPORT extern void PD_ConfigEnableMemoryOptim(
+ __pd_keep PD_Config* pd_config);
+///
+/// \brief A boolean state telling whether the memory optimization is
+/// activated.
+///
+/// \param[in] pd_config config
+/// \return Whether the memory optimization is activated.
+///
+PADDLE_CAPI_EXPORT extern PD_Bool PD_ConfigMemoryOptimEnabled(
+ __pd_keep PD_Config* pd_config);
+///
+/// \brief Turn on profiling report.
+/// If not turned on, no profiling report will be generated.
+///
+/// \param[in] pd_config config
+///
+PADDLE_CAPI_EXPORT extern void PD_ConfigEnableProfile(
+ __pd_keep PD_Config* pd_config);
+///
+/// \brief A boolean state telling whether the profiler is activated.
+///
+/// \param[in] pd_config config
+/// \return bool Whether the profiler is activated.
+///
+PADDLE_CAPI_EXPORT extern PD_Bool PD_ConfigProfileEnabled(
+ __pd_keep PD_Config* pd_config);
+///
+/// \brief Mute all logs in Paddle inference.
+///
+/// \param[in] pd_config config
+///
+PADDLE_CAPI_EXPORT extern void PD_ConfigDisableGlogInfo(
+ __pd_keep PD_Config* pd_config);
+///
+/// \brief A boolean state telling whether logs in Paddle inference are muted.
+///
+/// \param[in] pd_config config
+/// \return Whether logs in Paddle inference are muted.
+///
+PADDLE_CAPI_EXPORT extern PD_Bool PD_ConfigGlogInfoDisabled(
+ __pd_keep PD_Config* pd_config);
+///
+/// \brief Set the Config to be invalid.
+/// This is to ensure that a Config can only be used in one
+/// Predictor.
+///
+/// \param[in] pd_config config
+///
+PADDLE_CAPI_EXPORT extern void PD_ConfigSetInvalid(
+ __pd_keep PD_Config* pd_config);
+///
+/// \brief A boolean state telling whether the Config is valid.
+///
+/// \param[in] pd_config config
+/// \return Whether the Config is valid.
+///
+PADDLE_CAPI_EXPORT extern PD_Bool PD_ConfigIsValid(
+ __pd_keep PD_Config* pd_config);
+///
+/// \brief Partially release the memory
+///
+/// \param[in] pd_config config
+///
+PADDLE_CAPI_EXPORT extern void PD_ConfigPartiallyRelease(
+ __pd_keep PD_Config* pd_config);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
diff --git a/paddle/fluid/operators/distributed/distributed_pb.h b/paddle/fluid/inference/capi_exp/pd_inference_api.h
similarity index 58%
rename from paddle/fluid/operators/distributed/distributed_pb.h
rename to paddle/fluid/inference/capi_exp/pd_inference_api.h
index f1c662be9af67..5f21dca1a7bf6 100644
--- a/paddle/fluid/operators/distributed/distributed_pb.h
+++ b/paddle/fluid/inference/capi_exp/pd_inference_api.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 // http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License
@@ -14,17 +14,9 @@
 #pragma once
-#ifdef PADDLE_WITH_DISTRIBUTE
-
-#ifdef PADDLE_WITH_GRPC
-
-#include "paddle/fluid/operators/distributed/send_recv.grpc.pb.h"
-#include "paddle/fluid/operators/distributed/send_recv.pb.h"
-
-#else // PADDLE_WITH_GRPC
-
-#include "paddle/fluid/operators/distributed/send_recv.pb.h"
-
-#endif // PADDLE_WITH_GRPC
-
-#endif // PADDLE_WITH_DISTRIBUTE
+#include "pd_common.h" // NOLINT
+#include "pd_config.h" // NOLINT
+#include "pd_predictor.h" // NOLINT
+#include "pd_tensor.h" // NOLINT
+#include "pd_types.h" // NOLINT
diff --git a/paddle/fluid/inference/capi_exp/pd_predictor.cc b/paddle/fluid/inference/capi_exp/pd_predictor.cc
new file mode 100644
index 0000000000000..f5287a5152957
--- /dev/null
+++ b/paddle/fluid/inference/capi_exp/pd_predictor.cc
@@ -0,0 +1,109 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
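The PD_Config functions declared above map one-to-one onto the C++ paddle_infer::Config methods. Below is a minimal, illustrative sketch of how they combine; it is not part of the patch, and it assumes PD_ConfigCreate, PD_ConfigSetModel and PD_ConfigEnableUseGpu are declared earlier in pd_config.h (outside this hunk), with placeholder model paths.

/* Illustrative usage sketch; not part of the patch. PD_ConfigCreate,
 * PD_ConfigSetModel and PD_ConfigEnableUseGpu are assumed to be declared
 * earlier in pd_config.h; the model paths are placeholders. */
#include "paddle/fluid/inference/capi_exp/pd_inference_api.h"

static PD_Config* build_config(void) {
  PD_Config* config = PD_ConfigCreate();                          /* assumed */
  PD_ConfigSetModel(config, "model.pdmodel", "model.pdiparams");  /* assumed */
  PD_ConfigEnableUseGpu(config, 100, 0);     /* assumed: pool MB, device id */
  PD_ConfigSwitchIrOptim(config, 1);         /* turn on IR graph optimization */
  PD_ConfigEnableMemoryOptim(config);        /* reuse intermediate buffers */
  PD_ConfigSetCpuMathLibraryNumThreads(config, 4);
  /* Getters such as PD_ConfigMemoryPoolInitSizeMb(config) or
   * PD_ConfigIrOptim(config) can be used to verify the effective settings. */
  return config;  /* ownership is taken (__pd_take) by PD_PredictorCreate */
}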
+ +#include "paddle/fluid/inference/capi_exp/pd_predictor.h" +#include "paddle/fluid/inference/api/paddle_inference_api.h" +#include "paddle/fluid/inference/capi_exp/pd_types.h" +#include "paddle/fluid/inference/capi_exp/pd_utils.h" +#include "paddle/fluid/inference/capi_exp/types_internal.h" +#include "paddle/fluid/inference/capi_exp/utils_internal.h" +#include "paddle/fluid/platform/enforce.h" + +#define CHECK_AND_CONVERT_PD_PREDICTOR \ + PADDLE_ENFORCE_NOT_NULL( \ + pd_predictor, \ + paddle::platform::errors::InvalidArgument( \ + "The pointer of paddle predictor shouldn't be nullptr")); \ + auto& predictor = pd_predictor->predictor + +extern "C" { +__pd_give PD_Predictor* PD_PredictorCreate(__pd_take PD_Config* pd_config) { + PADDLE_ENFORCE_NOT_NULL( + pd_config, paddle::platform::errors::InvalidArgument( + "The pointer of paddle predictor shouldn't be nullptr")); + PD_Predictor* pd_predictor = new PD_Predictor(); + paddle_infer::Config* config = + reinterpret_cast(pd_config); + pd_predictor->predictor = paddle_infer::CreatePredictor(*config); + delete config; + return pd_predictor; +} + +__pd_give PD_Predictor* PD_PredictorClone( + __pd_keep PD_Predictor* pd_predictor) { + CHECK_AND_CONVERT_PD_PREDICTOR; + PD_Predictor* new_predictor = new PD_Predictor(); + new_predictor->predictor = predictor->Clone(); + return new_predictor; +} + +__pd_give PD_OneDimArrayCstr* PD_PredictorGetInputNames( + __pd_keep PD_Predictor* pd_predictor) { + CHECK_AND_CONVERT_PD_PREDICTOR; + std::vector names = predictor->GetInputNames(); + return paddle_infer::CvtVecToOneDimArrayCstr(names); +} + +__pd_give PD_OneDimArrayCstr* PD_PredictorGetOutputNames( + __pd_keep PD_Predictor* pd_predictor) { + CHECK_AND_CONVERT_PD_PREDICTOR; + std::vector names = predictor->GetOutputNames(); + return paddle_infer::CvtVecToOneDimArrayCstr(names); +} + +size_t PD_PredictorGetInputNum(__pd_keep PD_Predictor* pd_predictor) { + CHECK_AND_CONVERT_PD_PREDICTOR; + return predictor->GetInputNames().size(); +} + +size_t PD_PredictorGetOutputNum(__pd_keep PD_Predictor* pd_predictor) { + CHECK_AND_CONVERT_PD_PREDICTOR; + return predictor->GetOutputNames().size(); +} +__pd_give PD_Tensor* PD_PredictorGetInputHandle( + __pd_keep PD_Predictor* pd_predictor, const char* name) { + CHECK_AND_CONVERT_PD_PREDICTOR; + PD_Tensor* pd_tensor = new PD_Tensor(); + pd_tensor->tensor = predictor->GetInputHandle(name); + return pd_tensor; +} + +__pd_give PD_Tensor* PD_PredictorGetOutputHandle( + __pd_keep PD_Predictor* pd_predictor, const char* name) { + CHECK_AND_CONVERT_PD_PREDICTOR; + PD_Tensor* pd_tensor = new PD_Tensor(); + pd_tensor->tensor = predictor->GetOutputHandle(name); + return pd_tensor; +} + +PD_Bool PD_PredictorRun(__pd_keep PD_Predictor* pd_predictor) { + CHECK_AND_CONVERT_PD_PREDICTOR; + return predictor->Run(); +} + +void PD_PredictorClearIntermediateTensor(__pd_keep PD_Predictor* pd_predictor) { + CHECK_AND_CONVERT_PD_PREDICTOR; + predictor->ClearIntermediateTensor(); +} + +uint64_t PD_PredictorTryShrinkMemory(__pd_keep PD_Predictor* pd_predictor) { + CHECK_AND_CONVERT_PD_PREDICTOR; + return predictor->TryShrinkMemory(); +} + +void PD_PredictorDestroy(__pd_take PD_Predictor* pd_predictor) { + delete pd_predictor; +} + +} // extern "C" diff --git a/paddle/fluid/inference/capi_exp/pd_predictor.h b/paddle/fluid/inference/capi_exp/pd_predictor.h new file mode 100644 index 0000000000000..d4542d0b6d394 --- /dev/null +++ b/paddle/fluid/inference/capi_exp/pd_predictor.h @@ -0,0 +1,148 @@ +// Copyright (c) 2021 PaddlePaddle Authors. 
All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +/// +/// \file pd_predictor.h +/// +/// \brief interface for paddle predictor +/// +/// \author paddle-infer@baidu.com +/// \date 2021-04-21 +/// \since 2.1 +/// + +#pragma once + +#include "pd_common.h" // NOLINT + +typedef struct PD_Predictor PD_Predictor; +typedef struct PD_Config PD_Config; +typedef struct PD_Tensor PD_Tensor; +typedef struct PD_OneDimArrayCstr PD_OneDimArrayCstr; + +#ifdef __cplusplus +extern "C" { +#endif + +/// +/// \brief Create a new Predictor +/// +/// \param[in] Config config +/// \return new predicor. +/// +PADDLE_CAPI_EXPORT extern __pd_give PD_Predictor* PD_PredictorCreate( + __pd_take PD_Config* pd_config); +/// +/// \brief Clone a new Predictor +/// +/// \param[in] pd_predictor predictor +/// \return new predictor. +/// +PADDLE_CAPI_EXPORT extern __pd_give PD_Predictor* PD_PredictorClone( + __pd_keep PD_Predictor* pd_predictor); +/// +/// \brief Get the input names +/// +/// \param[in] pd_predictor predictor +/// \return input names +/// +PADDLE_CAPI_EXPORT extern __pd_give PD_OneDimArrayCstr* +PD_PredictorGetInputNames(__pd_keep PD_Predictor* pd_predictor); +/// +/// \brief Get the output names +/// +/// \param[in] pd_predictor predictor +/// \return output names +/// +PADDLE_CAPI_EXPORT extern __pd_give PD_OneDimArrayCstr* +PD_PredictorGetOutputNames(__pd_keep PD_Predictor* pd_predictor); + +/// +/// \brief Get the input number +/// +/// \param[in] pd_predictor predictor +/// \return input number +/// +PADDLE_CAPI_EXPORT extern size_t PD_PredictorGetInputNum( + __pd_keep PD_Predictor* pd_predictor); + +/// +/// \brief Get the output number +/// +/// \param[in] pd_predictor predictor +/// \return output number +/// +PADDLE_CAPI_EXPORT extern size_t PD_PredictorGetOutputNum( + __pd_keep PD_Predictor* pd_predictor); + +/// +/// \brief Get the Input Tensor object +/// +/// \param[in] pd_predictor predictor +/// \param[in] name input name +/// \return input tensor +/// +PADDLE_CAPI_EXPORT extern __pd_give PD_Tensor* PD_PredictorGetInputHandle( + __pd_keep PD_Predictor* pd_predictor, const char* name); + +/// +/// \brief Get the Output Tensor object +/// +/// \param[in] pd_predictor predictor +/// \param[in] name output name +/// \return output tensor +/// +PADDLE_CAPI_EXPORT extern __pd_give PD_Tensor* PD_PredictorGetOutputHandle( + __pd_keep PD_Predictor* pd_predictor, const char* name); + +/// +/// \brief Run the prediction engine +/// +/// \param[in] pd_predictor predictor +/// \return Whether the function executed successfully +/// +PADDLE_CAPI_EXPORT extern PD_Bool PD_PredictorRun( + __pd_keep PD_Predictor* pd_predictor); + +/// \brief Clear the intermediate tensors of the predictor +/// +/// \param[in] pd_predictor predictor +/// +PADDLE_CAPI_EXPORT extern void PD_PredictorClearIntermediateTensor( + __pd_keep PD_Predictor* pd_predictor); + +/// +/// \brief Release all tmp tensor to compress the size of the memory pool. 
+/// The memory pool is considered to be composed of a list of chunks, if +/// the chunk is not occupied, it can be released. +/// +/// \param[in] pd_predictor predictor +/// \return Number of bytes released. It may be smaller than the actual +/// released memory, because part of the memory is not managed by the +/// MemoryPool. +/// +PADDLE_CAPI_EXPORT extern uint64_t PD_PredictorTryShrinkMemory( + __pd_keep PD_Predictor* pd_predictor); + +/// +/// \brief Destroy a predictor object +/// +/// \param[in] pd_predictor predictor +/// +PADDLE_CAPI_EXPORT extern void PD_PredictorDestroy( + __pd_take PD_Predictor* pd_predictor); + +#ifdef __cplusplus +} // extern "C" +#endif diff --git a/paddle/fluid/inference/capi_exp/pd_tensor.cc b/paddle/fluid/inference/capi_exp/pd_tensor.cc new file mode 100644 index 0000000000000..9c661dea6f2bb --- /dev/null +++ b/paddle/fluid/inference/capi_exp/pd_tensor.cc @@ -0,0 +1,117 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/capi_exp/pd_tensor.h" +#include "paddle/fluid/inference/api/paddle_inference_api.h" +#include "paddle/fluid/inference/capi_exp/pd_types.h" +#include "paddle/fluid/inference/capi_exp/pd_utils.h" +#include "paddle/fluid/inference/capi_exp/types_internal.h" +#include "paddle/fluid/inference/capi_exp/utils_internal.h" +#include "paddle/fluid/platform/enforce.h" + +#define CHECK_AND_CONVERT_PD_TENSOR \ + PADDLE_ENFORCE_NOT_NULL( \ + pd_tensor, paddle::platform::errors::InvalidArgument( \ + "The pointer of paddle tensor shouldn't be nullptr")); \ + auto& tensor = pd_tensor->tensor + +extern "C" { + +void PD_TensorDestroy(__pd_take PD_Tensor* pd_tensor) { delete pd_tensor; } +void PD_TensorReshape(__pd_keep PD_Tensor* pd_tensor, size_t shape_size, + int32_t* shape) { + CHECK_AND_CONVERT_PD_TENSOR; + std::vector shapes(shape_size); + for (size_t index = 0; index < shape_size; ++index) { + shapes[index] = shape[index]; + } + tensor->Reshape(shapes); +} + +#define REPEAT_ALL_DATA_TYPE(func) \ + func(float, Float) func(int64_t, Int64) func(int32_t, Int32) \ + func(uint8_t, Uint8) func(int8_t, Int8) + +#define PD_TENSOR_MUTABLE_DATA_IMPL(type, Type) \ + type* PD_TensorMutableData##Type(__pd_keep PD_Tensor* pd_tensor, \ + PD_PlaceType place) { \ + CHECK_AND_CONVERT_PD_TENSOR; \ + return tensor->mutable_data(paddle_infer::CvtToCxxPlaceType(place)); \ + } +REPEAT_ALL_DATA_TYPE(PD_TENSOR_MUTABLE_DATA_IMPL) +#undef PD_TENSOR_MUTABLE_DATA_IMPL + +#define PD_TENSOR_DATA_IMPL(type, Type) \ + type* PD_TensorData##Type(__pd_keep PD_Tensor* pd_tensor, \ + PD_PlaceType* place, int32_t* size) { \ + CHECK_AND_CONVERT_PD_TENSOR; \ + PADDLE_ENFORCE_NOT_NULL(place, \ + paddle::platform::errors::InvalidArgument( \ + "The pointer of place shouldn't be nullptr")); \ + PADDLE_ENFORCE_NOT_NULL(size, \ + paddle::platform::errors::InvalidArgument( \ + "The pointer of size shouldn't be nullptr")); \ + paddle_infer::PlaceType cxx_palce_type; \ + int cxx_size; \ + 
type* data = tensor->data(&cxx_palce_type, &cxx_size); \ + *place = paddle_infer::CvtFromCxxPlaceType(cxx_palce_type); \ + *size = static_cast(cxx_size); \ + return data; \ + } +REPEAT_ALL_DATA_TYPE(PD_TENSOR_DATA_IMPL) +#undef PD_TENSOR_DATA_IMPL + +#define PD_TENSOR_COPY_FROM_CPU_IMPL(type, Type) \ + void PD_TensorCopyFromCpu##Type(__pd_keep PD_Tensor* pd_tensor, \ + const type* data) { \ + CHECK_AND_CONVERT_PD_TENSOR; \ + tensor->CopyFromCpu(data); \ + } +REPEAT_ALL_DATA_TYPE(PD_TENSOR_COPY_FROM_CPU_IMPL) +#undef PD_TENSOR_COPY_FROM_CPU_IMPL + +#define PD_TENSOR_COPY_TO_CPU_IMPL(type, Type) \ + void PD_TensorCopyToCpu##Type(__pd_keep PD_Tensor* pd_tensor, type* data) { \ + CHECK_AND_CONVERT_PD_TENSOR; \ + tensor->CopyToCpu(data); \ + } +REPEAT_ALL_DATA_TYPE(PD_TENSOR_COPY_TO_CPU_IMPL) +#undef PD_TENSOR_COPY_TO_CPU_IMPL + +#undef REPEAT_ALL_DATA_TYPE + +__pd_give PD_OneDimArrayInt32* PD_TensorGetShape( + __pd_keep PD_Tensor* pd_tensor) { + CHECK_AND_CONVERT_PD_TENSOR; + return paddle_infer::CvtVecToOneDimArrayInt32(tensor->shape()); +} +void PD_TensorSetLod(__pd_keep PD_Tensor* pd_tensor, + __pd_keep PD_TwoDimArraySize* lod) { + CHECK_AND_CONVERT_PD_TENSOR; + tensor->SetLoD(paddle_infer::CvtTwoDimArrayToVecSize(lod)); +} +__pd_give PD_TwoDimArraySize* PD_TensorGetLod(__pd_keep PD_Tensor* pd_tensor) { + CHECK_AND_CONVERT_PD_TENSOR; + return paddle_infer::CvtVecToTwoDimArraySize(tensor->lod()); +} +const char* PD_TensorGetName(__pd_keep PD_Tensor* pd_tensor) { + CHECK_AND_CONVERT_PD_TENSOR; + return tensor->name().c_str(); +} +PD_DataType PD_TensorGetDataType(__pd_keep PD_Tensor* pd_tensor) { + CHECK_AND_CONVERT_PD_TENSOR; + return paddle_infer::CvtFromCxxDatatype(tensor->type()); +} + +} // extern "C" diff --git a/paddle/fluid/inference/capi_exp/pd_tensor.h b/paddle/fluid/inference/capi_exp/pd_tensor.h new file mode 100644 index 0000000000000..29ea4b5d62e43 --- /dev/null +++ b/paddle/fluid/inference/capi_exp/pd_tensor.h @@ -0,0 +1,287 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +/// +/// \file pd_tensor.h +/// +/// \brief interface for paddle tensor +/// +/// \author paddle-infer@baidu.com +/// \date 2021-04-21 +/// \since 2.1 +/// + +#pragma once + +#include "pd_common.h" // NOLINT + +typedef struct PD_Tensor PD_Tensor; +typedef struct PD_OneDimArrayInt32 PD_OneDimArrayInt32; +typedef struct PD_TwoDimArraySize PD_TwoDimArraySize; + +#ifdef __cplusplus +extern "C" { +#endif + +/// +/// \brief Destroy the paddle tensor +/// +/// \param[in] pd_tensor tensor +/// +PADDLE_CAPI_EXPORT extern void PD_TensorDestroy(__pd_take PD_Tensor* pd_tensor); + +/// +/// \brief Reset the shape of the tensor. +/// Generally it's only used for the input tensor. +/// Reshape must be called before calling PD_TensorMutableData*() or +/// PD_TensorCopyFromCpu*() +/// +/// \param[in] pd_tensor tensor. +/// \param[in] shape_size The size of shape. +/// \param[in] shape The shape to set. 
+/// +PADDLE_CAPI_EXPORT extern void PD_TensorReshape(__pd_keep PD_Tensor* pd_tensor, + size_t shape_size, + int32_t* shape); + +/// +/// \brief Get the memory pointer in CPU or GPU with 'float' data type. +/// Please Reshape the tensor first before call this. +/// It's usually used to get input data pointer. +/// +/// \param[in] pd_tensor tensor. +/// \param[in] place The place of the tensor. +/// \return Memory pointer of pd_tensor +/// +PADDLE_CAPI_EXPORT extern float* PD_TensorMutableDataFloat( + __pd_keep PD_Tensor* pd_tensor, PD_PlaceType place); +/// +/// \brief Get the memory pointer in CPU or GPU with 'int64_t' data type. +/// Please Reshape the tensor first before call this. +/// It's usually used to get input data pointer. +/// +/// \param[in] pd_tensor tensor. +/// \param[in] place The place of the tensor. +/// \return Memory pointer of pd_tensor +/// +PADDLE_CAPI_EXPORT extern int64_t* PD_TensorMutableDataInt64( + __pd_keep PD_Tensor* pd_tensor, PD_PlaceType place); +/// +/// \brief Get the memory pointer in CPU or GPU with 'int32_t' data type. +/// Please Reshape the tensor first before call this. +/// It's usually used to get input data pointer. +/// +/// \param[in] pd_tensor tensor. +/// \param[in] place The place of the tensor. +/// \return Memory pointer of pd_tensor +/// +PADDLE_CAPI_EXPORT extern int32_t* PD_TensorMutableDataInt32( + __pd_keep PD_Tensor* pd_tensor, PD_PlaceType place); +/// +/// \brief Get the memory pointer in CPU or GPU with 'uint8_t' data type. +/// Please Reshape the tensor first before call this. +/// It's usually used to get input data pointer. +/// +/// \param[in] pd_tensor tensor. +/// \param[in] place The place of the tensor. +/// \return Memory pointer of pd_tensor +/// +PADDLE_CAPI_EXPORT extern uint8_t* PD_TensorMutableDataUint8( + __pd_keep PD_Tensor* pd_tensor, PD_PlaceType place); +/// +/// \brief Get the memory pointer in CPU or GPU with 'int8_t' data type. +/// Please Reshape the tensor first before call this. +/// It's usually used to get input data pointer. +/// +/// \param[in] pd_tensor tensor. +/// \param[in] place The place of the tensor. +/// \return Memory pointer of pd_tensor +/// +PADDLE_CAPI_EXPORT extern int8_t* PD_TensorMutableDataInt8( + __pd_keep PD_Tensor* pd_tensor, PD_PlaceType place); +/// +/// \brief Get the memory pointer directly. +/// It's usually used to get the output data pointer. +/// +/// \param[in] pd_tensor tensor. +/// \param[out] place To get the device type of the tensor. +/// \param[out] size To get the data size of the tensor. +/// \return The tensor data buffer pointer. +/// +PADDLE_CAPI_EXPORT extern float* PD_TensorDataFloat( + __pd_keep PD_Tensor* pd_tensor, PD_PlaceType* place, int32_t* size); +/// +/// \brief Get the memory pointer directly. +/// It's usually used to get the output data pointer. +/// +/// \param[in] pd_tensor tensor. +/// \param[out] place To get the device type of the tensor. +/// \param[out] size To get the data size of the tensor. +/// \return The tensor data buffer pointer. +/// +PADDLE_CAPI_EXPORT extern int64_t* PD_TensorDataInt64( + __pd_keep PD_Tensor* pd_tensor, PD_PlaceType* place, int32_t* size); +/// +/// \brief Get the memory pointer directly. +/// It's usually used to get the output data pointer. +/// +/// \param[in] pd_tensor tensor. +/// \param[out] place To get the device type of the tensor. +/// \param[out] size To get the data size of the tensor. +/// \return The tensor data buffer pointer. 
+/// +PADDLE_CAPI_EXPORT extern int32_t* PD_TensorDataInt32( + __pd_keep PD_Tensor* pd_tensor, PD_PlaceType* place, int32_t* size); +/// +/// \brief Get the memory pointer directly. +/// It's usually used to get the output data pointer. +/// +/// \param[in] pd_tensor tensor. +/// \param[out] place To get the device type of the tensor. +/// \param[out] size To get the data size of the tensor. +/// \return The tensor data buffer pointer. +/// +PADDLE_CAPI_EXPORT extern uint8_t* PD_TensorDataUint8( + __pd_keep PD_Tensor* pd_tensor, PD_PlaceType* place, int32_t* size); +/// +/// \brief Get the memory pointer directly. +/// It's usually used to get the output data pointer. +/// +/// \param[in] pd_tensor tensor. +/// \param[out] place To get the device type of the tensor. +/// \param[out] size To get the data size of the tensor. +/// \return The tensor data buffer pointer. +/// +PADDLE_CAPI_EXPORT extern int8_t* PD_TensorDataInt8( + __pd_keep PD_Tensor* pd_tensor, PD_PlaceType* place, int32_t* size); +/// +/// \brief Copy the host memory to tensor data. +/// It's usually used to set the input tensor data. +/// \param[in] pd_tensor tensor. +/// \param[in] data The pointer of the data, from which the tensor will copy. +/// +PADDLE_CAPI_EXPORT extern void PD_TensorCopyFromCpuFloat( + __pd_keep PD_Tensor* pd_tensor, const float* data); +/// +/// \brief Copy the host memory to tensor data. +/// It's usually used to set the input tensor data. +/// \param[in] pd_tensor tensor. +/// \param[in] data The pointer of the data, from which the tensor will copy. +/// +PADDLE_CAPI_EXPORT extern void PD_TensorCopyFromCpuInt64( + __pd_keep PD_Tensor* pd_tensor, const int64_t* data); +/// +/// \brief Copy the host memory to tensor data. +/// It's usually used to set the input tensor data. +/// \param[in] pd_tensor tensor. +/// \param[in] data The pointer of the data, from which the tensor will copy. +/// +PADDLE_CAPI_EXPORT extern void PD_TensorCopyFromCpuInt32( + __pd_keep PD_Tensor* pd_tensor, const int32_t* data); +/// +/// \brief Copy the host memory to tensor data. +/// It's usually used to set the input tensor data. +/// \param[in] pd_tensor tensor. +/// \param[in] data The pointer of the data, from which the tensor will copy. +/// +PADDLE_CAPI_EXPORT extern void PD_TensorCopyFromCpuUint8( + __pd_keep PD_Tensor* pd_tensor, const uint8_t* data); +/// +/// \brief Copy the host memory to tensor data. +/// It's usually used to set the input tensor data. +/// \param[in] pd_tensor tensor. +/// \param[in] data The pointer of the data, from which the tensor will copy. +/// +PADDLE_CAPI_EXPORT extern void PD_TensorCopyFromCpuInt8( + __pd_keep PD_Tensor* pd_tensor, const int8_t* data); +/// +/// \brief Copy the tensor data to the host memory. +/// It's usually used to get the output tensor data. +/// \param[in] pd_tensor tensor. +/// \param[out] data The tensor will copy the data to the address. +/// +PADDLE_CAPI_EXPORT extern void PD_TensorCopyToCpuFloat( + __pd_keep PD_Tensor* pd_tensor, float* data); +/// +/// \brief Copy the tensor data to the host memory. +/// It's usually used to get the output tensor data. +/// \param[in] pd_tensor tensor. +/// \param[out] data The tensor will copy the data to the address. +/// +PADDLE_CAPI_EXPORT extern void PD_TensorCopyToCpuInt64( + __pd_keep PD_Tensor* pd_tensor, int64_t* data); +/// +/// \brief Copy the tensor data to the host memory. +/// It's usually used to get the output tensor data. +/// \param[in] pd_tensor tensor. 
+/// \param[out] data The tensor will copy the data to the address. +/// +PADDLE_CAPI_EXPORT extern void PD_TensorCopyToCpuInt32( + __pd_keep PD_Tensor* pd_tensor, int32_t* data); +/// +/// \brief Copy the tensor data to the host memory. +/// It's usually used to get the output tensor data. +/// \param[in] pd_tensor tensor. +/// \param[out] data The tensor will copy the data to the address. +/// +PADDLE_CAPI_EXPORT extern void PD_TensorCopyToCpuUint8( + __pd_keep PD_Tensor* pd_tensor, uint8_t* data); +/// +/// \brief Copy the tensor data to the host memory. +/// It's usually used to get the output tensor data. +/// \param[in] pd_tensor tensor. +/// \param[out] data The tensor will copy the data to the address. +/// +PADDLE_CAPI_EXPORT extern void PD_TensorCopyToCpuInt8( + __pd_keep PD_Tensor* pd_tensor, int8_t* data); +/// +/// \brief Get the tensor shape +/// \param[in] pd_tensor tensor. +/// \return The tensor shape. +/// +PADDLE_CAPI_EXPORT extern __pd_give PD_OneDimArrayInt32* PD_TensorGetShape( + __pd_keep PD_Tensor* pd_tensor); + +/// +/// \brief Set the tensor lod information +/// \param[in] pd_tensor tensor. +/// \param[in] lod lod information. +/// +PADDLE_CAPI_EXPORT extern void PD_TensorSetLod( + __pd_keep PD_Tensor* pd_tensor, __pd_keep PD_TwoDimArraySize* lod); +/// +/// \brief Get the tensor lod information +/// \param[in] pd_tensor tensor. +/// \return the lod information. +/// +PADDLE_CAPI_EXPORT extern __pd_give PD_TwoDimArraySize* PD_TensorGetLod( + __pd_keep PD_Tensor* pd_tensor); +/// +/// \brief Get the tensor name +/// \param[in] pd_tensor tensor. +/// \return the tensor name. +/// +PADDLE_CAPI_EXPORT extern const char* PD_TensorGetName( + __pd_keep PD_Tensor* pd_tensor); +/// +/// \brief Get the tensor data type +/// \param[in] pd_tensor tensor. +/// \return the tensor data type. +/// +PADDLE_CAPI_EXPORT extern PD_DataType PD_TensorGetDataType( + __pd_keep PD_Tensor* pd_tensor); + +#ifdef __cplusplus +} // extern "C" +#endif diff --git a/paddle/fluid/inference/capi_exp/pd_types.h b/paddle/fluid/inference/capi_exp/pd_types.h new file mode 100644 index 0000000000000..a5da2913a9b20 --- /dev/null +++ b/paddle/fluid/inference/capi_exp/pd_types.h @@ -0,0 +1,40 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
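With pd_predictor.h, pd_tensor.h and the destroy helpers of pd_utils.h declared above, a typical inference call chain looks roughly like the sketch below. It is illustrative only: build_config() is the hypothetical helper from the earlier config sketch, and the shape and buffer sizes are placeholders for a model with one float input and one float output.

/* Illustrative end-to-end sketch; not part of the patch. build_config() is the
 * hypothetical helper from the config sketch above; shapes are placeholders. */
#include <stdlib.h>
#include "paddle/fluid/inference/capi_exp/pd_inference_api.h"

void run_once(void) {
  /* PD_PredictorCreate takes ownership of the config (__pd_take). */
  PD_Predictor* predictor = PD_PredictorCreate(build_config());

  PD_OneDimArrayCstr* in_names = PD_PredictorGetInputNames(predictor);
  PD_Tensor* input = PD_PredictorGetInputHandle(predictor, in_names->data[0]);

  int32_t shape[4] = {1, 3, 224, 224};
  float* data = (float*)calloc(1 * 3 * 224 * 224, sizeof(float));
  PD_TensorReshape(input, 4, shape);       /* must precede CopyFromCpu */
  PD_TensorCopyFromCpuFloat(input, data);

  PD_PredictorRun(predictor);

  PD_OneDimArrayCstr* out_names = PD_PredictorGetOutputNames(predictor);
  PD_Tensor* output = PD_PredictorGetOutputHandle(predictor, out_names->data[0]);
  PD_OneDimArrayInt32* out_shape = PD_TensorGetShape(output);
  int32_t numel = 1;
  for (size_t i = 0; i < out_shape->size; ++i) numel *= out_shape->data[i];
  float* result = (float*)malloc(numel * sizeof(float));
  PD_TensorCopyToCpuFloat(output, result);

  /* Every __pd_give object must be released with its matching destroy call. */
  PD_OneDimArrayInt32Destroy(out_shape);
  PD_OneDimArrayCstrDestroy(out_names);
  PD_OneDimArrayCstrDestroy(in_names);
  PD_TensorDestroy(output);
  PD_TensorDestroy(input);
  PD_PredictorDestroy(predictor);
  free(result);
  free(data);
}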
+ +#pragma once + +#include +#include + +#include "pd_common.h" // NOLINT + +typedef struct PD_OneDimArrayInt32 { + size_t size; + int32_t* data; +} PD_OneDimArrayInt32; // std::vector + +typedef struct PD_OneDimArraySize { + size_t size; + size_t* data; +} PD_OneDimArraySize; // std::vector + +typedef struct PD_OneDimArrayCstr { + size_t size; + char** data; +} PD_OneDimArrayCstr; // std::vector + +typedef struct PD_TwoDimArraySize { + size_t size; + PD_OneDimArraySize** data; +} PD_TwoDimArraySize; // std::vector> diff --git a/paddle/fluid/inference/capi_exp/pd_utils.cc b/paddle/fluid/inference/capi_exp/pd_utils.cc new file mode 100644 index 0000000000000..2e762619f5567 --- /dev/null +++ b/paddle/fluid/inference/capi_exp/pd_utils.cc @@ -0,0 +1,221 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include "paddle/fluid/inference/api/paddle_inference_api.h" +#include "paddle/fluid/inference/capi_exp/pd_utils.h" +#include "paddle/fluid/inference/capi_exp/utils_internal.h" +#include "paddle/fluid/platform/enforce.h" + +#define DESTROY_ONE_DIM_ARRAY(type) \ + void PD_OneDimArray##type##Destroy(__pd_take PD_OneDimArray##type* array) { \ + if (array != NULL) { \ + delete[] array->data; \ + delete array; \ + } \ + } +#define CONVERT_VEC_TO_ONE_DIM_ARRAY(type, Type, vec_type) \ + __pd_give PD_OneDimArray##Type* CvtVecToOneDimArray##Type( \ + const std::vector& vec) { \ + PD_OneDimArray##Type* array = new PD_OneDimArray##Type; \ + array->size = vec.size(); \ + array->data = vec.empty() ? 
NULL : new type[vec.size()]; \ + for (size_t index = 0; index < vec.size(); ++index) { \ + array->data[index] = vec[index]; \ + } \ + return array; \ + } +#define CONVERT_ONE_DIM_ARRAY_TO_VEC(type, Type, vec_type) \ + std::vector CvtOneDimArrayToVec##Type( \ + __pd_keep const PD_OneDimArray##Type* array) { \ + std::vector vec; \ + if (array != NULL) { \ + vec.resize(array->size); \ + for (size_t index = 0; index < array->size; ++index) { \ + vec[index] = array->data[index]; \ + } \ + } \ + return vec; \ + } + +#define ONE_DIM_ARRAY_UTILS_FUNC_IMPL(type, Type, vec_type) \ + extern "C" { \ + DESTROY_ONE_DIM_ARRAY(Type); \ + } \ + namespace paddle_infer { \ + CONVERT_VEC_TO_ONE_DIM_ARRAY(type, Type, vec_type) \ + CONVERT_ONE_DIM_ARRAY_TO_VEC(type, Type, vec_type) \ + } + +ONE_DIM_ARRAY_UTILS_FUNC_IMPL(int32_t, Int32, int) +ONE_DIM_ARRAY_UTILS_FUNC_IMPL(size_t, Size, size_t) + +#undef ONE_DIM_ARRAY_UTILS_FUNC_IMPL +#undef CONVERT_ONE_DIM_ARRAY_TO_VEC +#undef CONVERT_VEC_TO_ONE_DIM_ARRAY +#undef DESTROY_ONE_DIM_ARRAY + +void PD_OneDimArrayCstrDestroy(__pd_take PD_OneDimArrayCstr* array) { + if (array != NULL) { + if (array->size != 0) { + for (size_t index = 0; index < array->size; ++index) { + delete[] array->data[index]; + } + } + delete[] array->data; + delete array; + } +} +namespace paddle_infer { + +__pd_give PD_OneDimArrayCstr* CvtVecToOneDimArrayCstr( + const std::vector& vec) { + PD_OneDimArrayCstr* array = new PD_OneDimArrayCstr; + array->size = vec.size(); + array->data = vec.empty() ? NULL : new char*[vec.size()]; + for (size_t index = 0u; index < vec.size(); ++index) { + array->data[index] = new char[vec[index].size() + 1]; + memcpy(array->data[index], vec[index].c_str(), vec[index].size() + 1); + } + return array; +} + +std::vector CvtOneDimArrayToVecCstr( + __pd_keep const PD_OneDimArrayCstr* array) { + std::vector vec; + for (size_t index = 0; index < array->size; ++index) { + vec.emplace_back(array->data[index]); + } + return vec; +} + +} // namespace paddle_infer + +#define DESTROY_TWO_DIM_ARRAY(type) \ + void PD_TwoDimArray##type##Destroy(__pd_take PD_TwoDimArray##type* array) { \ + if (array != NULL) { \ + if (array->size != 0) { \ + for (size_t index = 0; index < array->size; ++index) { \ + PD_OneDimArray##type##Destroy(array->data[index]); \ + } \ + } \ + delete[] array->data; \ + delete array; \ + } \ + } +#define CONVERT_VEC_TO_TWO_DIM_ARRAY(type, Type, vec_type) \ + __pd_give PD_TwoDimArray##Type* CvtVecToTwoDimArray##Type( \ + const std::vector>& vec) { \ + PD_TwoDimArray##Type* array = new PD_TwoDimArray##Type; \ + array->size = vec.size(); \ + array->data = vec.empty() ? 
NULL : new PD_OneDimArray##Type*[vec.size()]; \ + for (size_t index = 0; index < vec.size(); ++index) { \ + array->data[index] = CvtVecToOneDimArray##Type(vec[index]); \ + } \ + return array; \ + } +#define CONVERT_TWO_DIM_ARRAY_TO_VEC(type, Type, vec_type) \ + std::vector> CvtTwoDimArrayToVec##Type( \ + __pd_keep const PD_TwoDimArray##Type* array) { \ + std::vector> vec; \ + if (array != NULL && array->size != 0) { \ + vec.resize(array->size); \ + for (size_t index = 0; index < array->size; ++index) { \ + vec[index] = CvtOneDimArrayToVec##Type((array->data)[index]); \ + } \ + } \ + return vec; \ + } +#define TWO_DIM_ARRAY_UTILS_FUNC_IMPL(type, Type, vec_type) \ + extern "C" { \ + DESTROY_TWO_DIM_ARRAY(Type); \ + } \ + namespace paddle_infer { \ + CONVERT_VEC_TO_TWO_DIM_ARRAY(type, Type, vec_type) \ + CONVERT_TWO_DIM_ARRAY_TO_VEC(type, Type, vec_type) \ + } + +TWO_DIM_ARRAY_UTILS_FUNC_IMPL(size_t, Size, size_t) + +#undef TWO_DIM_ARRAY_UTILS_FUNC_IMPL +#undef CONVERT_TWO_DIM_ARRAY_TO_VEC +#undef CONVERT_VEC_TO_TWO_DIM_ARRAY +#undef DESTROY_TWO_DIM_ARRAY + +namespace paddle_infer { + +PlaceType CvtToCxxPlaceType(PD_PlaceType place_type) { + switch (place_type) { + case PD_PLACE_UNK: + return PlaceType::kUNK; + case PD_PLACE_CPU: + return PlaceType::kCPU; + case PD_PLACE_GPU: + return PlaceType::kGPU; + case PD_PLACE_XPU: + return PlaceType::kXPU; + default: + PADDLE_THROW(paddle::platform::errors::InvalidArgument( + "Unsupport paddle place type %d.", place_type)); + return PlaceType::kUNK; + } +} + +PD_PlaceType CvtFromCxxPlaceType(PlaceType place_type) { + switch (place_type) { + case PlaceType::kCPU: + return PD_PLACE_CPU; + case PlaceType::kGPU: + return PD_PLACE_GPU; + case PlaceType::kXPU: + return PD_PLACE_XPU; + default: + return PD_PLACE_UNK; + } +} + +DataType CvtToCxxDatatype(PD_DataType data_type) { + switch (data_type) { + case PD_DATA_FLOAT32: + return DataType::FLOAT32; + case PD_DATA_INT64: + return DataType::INT64; + case PD_DATA_INT32: + return DataType::INT32; + case PD_DATA_UINT8: + return DataType::UINT8; + default: + PADDLE_THROW(paddle::platform::errors::InvalidArgument( + "Unsupport paddle data type %d.", data_type)); + return DataType::FLOAT32; + } +} + +PD_DataType CvtFromCxxDatatype(DataType data_type) { + switch (data_type) { + case DataType::FLOAT32: + return PD_DATA_FLOAT32; + case DataType::INT64: + return PD_DATA_INT64; + case DataType::INT32: + return PD_DATA_INT32; + case DataType::UINT8: + return PD_DATA_UINT8; + default: + return PD_DATA_UNK; + } +} + +} // namespace paddle_infer diff --git a/paddle/fluid/inference/capi_exp/pd_utils.h b/paddle/fluid/inference/capi_exp/pd_utils.h new file mode 100644 index 0000000000000..68e519d4bb5e9 --- /dev/null +++ b/paddle/fluid/inference/capi_exp/pd_utils.h @@ -0,0 +1,70 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +/// +/// \file pd_utils.h +/// +/// \brief Some utility function to destroy paddle struct. 
+/// +/// \author paddle-infer@baidu.com +/// \date 2021-04-21 +/// \since 2.1 +/// + +#pragma once + +#include +#include + +#include "pd_types.h" // NOLINT + +#ifdef __cplusplus +extern "C" { +#endif + +/// +/// \brief Destroy the PD_OneDimArrayInt32 object pointed to by the pointer. +/// +/// \param[in] array pointer to the PD_OneDimArrayInt32 object. +/// +PADDLE_CAPI_EXPORT extern void PD_OneDimArrayInt32Destroy( + __pd_take PD_OneDimArrayInt32* array); + +/// +/// \brief Destroy the PD_OneDimArrayCstr object pointed to by the pointer. +/// +/// \param[in] array pointer to the PD_OneDimArrayCstr object. +/// +PADDLE_CAPI_EXPORT extern void PD_OneDimArrayCstrDestroy( + __pd_take PD_OneDimArrayCstr* array); + +/// +/// \brief Destroy the PD_OneDimArraySize object pointed to by the pointer. +/// +/// \param[in] array pointer to the PD_OneDimArraySize object. +/// +PADDLE_CAPI_EXPORT extern void PD_OneDimArraySizeDestroy( + __pd_take PD_OneDimArraySize* array); + +/// +/// \brief Destroy the PD_TwoDimArraySize object pointed to by the pointer. +/// +/// \param[in] array pointer to the PD_TwoDimArraySize object. +/// +PADDLE_CAPI_EXPORT extern void PD_TwoDimArraySizeDestroy( + __pd_take PD_TwoDimArraySize* array); + +#ifdef __cplusplus +} // extern "C" +#endif diff --git a/paddle/fluid/inference/capi_exp/types_internal.h b/paddle/fluid/inference/capi_exp/types_internal.h new file mode 100644 index 0000000000000..8a61b9a884c3b --- /dev/null +++ b/paddle/fluid/inference/capi_exp/types_internal.h @@ -0,0 +1,29 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include + +#include "paddle/fluid/inference/api/paddle_inference_api.h" +#include "paddle/fluid/inference/capi_exp/pd_common.h" + +typedef struct PD_Tensor { + std::unique_ptr tensor; +} PD_Tensor; + +typedef struct PD_Predictor { + std::shared_ptr predictor; +} PD_Predictor; diff --git a/paddle/fluid/inference/capi_exp/utils_internal.h b/paddle/fluid/inference/capi_exp/utils_internal.h new file mode 100644 index 0000000000000..fbae512ecd855 --- /dev/null +++ b/paddle/fluid/inference/capi_exp/utils_internal.h @@ -0,0 +1,153 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +/// +/// \file utils_internal.h +/// +/// \brief Some utility function used to convert object between C Struct and C++ +/// Class. 
+/// +/// \author paddle-infer@baidu.com +/// \date 2021-04-21 +/// \since 2.1 +/// + +#pragma once + +#include +#include +#include + +#include "paddle/fluid/inference/api/paddle_inference_api.h" +#include "paddle/fluid/inference/capi_exp/pd_types.h" + +namespace paddle_infer { + +/// +/// \brief Convert the 'std::vector' object to a 'PD_OneDimArrayInt32' +/// object. +/// +/// \param[in] vec source object. +/// \return target object. +/// +__pd_give PD_OneDimArrayInt32* CvtVecToOneDimArrayInt32( + const std::vector& vec); + +/// +/// \brief Convert the 'PD_OneDimArrayInt32' object to a 'std::vector' +/// object. +/// +/// \param[in] array source object. +/// \return target object. +/// +std::vector CvtOneDimArrayToVecInt32( + __pd_keep const PD_OneDimArrayInt32* array); + +/// +/// \brief Convert the 'std::vector' object to a 'PD_OneDimArraySize' +/// object. +/// +/// \param[in] vec source object. +/// \return target object. +/// +__pd_give PD_OneDimArraySize* CvtVecToOneDimArraySize( + const std::vector& vec); + +/// +/// \brief Convert the 'PD_OneDimArraySize' object to a 'std::vector' +/// object. +/// +/// \param[in] array source object. +/// \return target object. +/// +std::vector CvtOneDimArrayToVecSize( + __pd_keep const PD_OneDimArraySize* array); + +/// +/// \brief Convert the 'std::vector' object to a +/// 'PD_OneDimArrayCstr' object. +/// +/// \param[in] vec source object. +/// \return target object. +/// +__pd_give PD_OneDimArrayCstr* CvtVecToOneDimArrayCstr( + const std::vector& vec); + +/// +/// \brief Convert the 'PD_OneDimArrayCstr' object to a +/// 'std::vector' object. +/// +/// \param[in] array source object. +/// \return target object. +/// +std::vector CvtOneDimArrayToVecCstr( + __pd_keep const PD_OneDimArrayCstr* array); + +/// +/// \brief Convert the 'std::vector>' object to a +/// 'PD_TwoDimArraySize' object. +/// +/// \param[in] vec source object. +/// \return target object. +/// +__pd_give PD_TwoDimArraySize* CvtVecToTwoDimArraySize( + const std::vector>& vec); + +/// +/// \brief Convert the 'PD_TwoDimArraySize' object to a +/// 'std::vector>' object. +/// +/// \param[in] array source object. +/// \return target object. +/// +std::vector> CvtTwoDimArrayToVecSize( + __pd_keep const PD_TwoDimArraySize* array); + +/// +/// \brief Convert the 'PD_PlaceType' object to a 'paddle_infer::PlaceType' +/// object. +/// +/// \param[in] place_type source object. +/// \return target object. +/// +PlaceType CvtToCxxPlaceType(PD_PlaceType place_type); + +/// +/// \brief Convert the 'paddle_infer::PlaceType' object to a 'PD_PlaceType' +/// object. +/// +/// \param[in] place_type source object. +/// \return target object. +/// +PD_PlaceType CvtFromCxxPlaceType(PlaceType place_type); + +/// +/// \brief Convert the 'PD_DataType' object to a 'paddle_infer::DataType' +/// object. +/// +/// \param[in] place_type source object. +/// \return target object. +/// +DataType CvtToCxxDatatype(PD_DataType data_type); + +/// +/// \brief Convert the 'paddle_infer::DataType' object to a 'PD_DataType' +/// object. +/// +/// \param[in] place_type source object. +/// \return target object. 
+/// +PD_DataType CvtFromCxxDatatype(DataType data_type); + +} // namespace paddle_infer diff --git a/paddle/fluid/inference/lite/engine.cc b/paddle/fluid/inference/lite/engine.cc index 59a786e46c98b..908e1ab990bb7 100644 --- a/paddle/fluid/inference/lite/engine.cc +++ b/paddle/fluid/inference/lite/engine.cc @@ -59,8 +59,14 @@ paddle::lite_api::PaddlePredictor* EngineManager::Create( #endif #ifdef LITE_SUBGRAPH_WITH_XPU + // Deprecated in Paddle-Lite release/v2.8 lite_cxx_config.set_xpu_workspace_l3_size_per_thread( cfg.xpu_l3_workspace_size); + lite_cxx_config.set_xpu_l3_cache_method(cfg.xpu_l3_workspace_size, + cfg.locked); + lite_cxx_config.set_xpu_conv_autotune(cfg.autotune, cfg.autotune_file); + lite_cxx_config.set_xpu_multi_encoder_method(cfg.precision, + cfg.adaptive_seqlen); #endif // create predictor diff --git a/paddle/fluid/inference/lite/engine.h b/paddle/fluid/inference/lite/engine.h index 5ba487cc24d7d..a64ef1eda828b 100644 --- a/paddle/fluid/inference/lite/engine.h +++ b/paddle/fluid/inference/lite/engine.h @@ -42,6 +42,11 @@ struct EngineConfig { // for xpu size_t xpu_l3_workspace_size; + bool locked = false; + bool autotune = true; + std::string autotune_file = ""; + std::string precision = "int16"; + bool adaptive_seqlen = false; // for x86 or arm int cpu_math_library_num_threads{1}; diff --git a/paddle/fluid/inference/tensorrt/convert/batch_norm_op.cc b/paddle/fluid/inference/tensorrt/convert/batch_norm_op.cc index a6484a1355705..7ea41839cb939 100644 --- a/paddle/fluid/inference/tensorrt/convert/batch_norm_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/batch_norm_op.cc @@ -38,38 +38,6 @@ class BatchNormOpConverter : public OpConverter { VLOG(3) << "convert a fluid batch norm op to tensorrt batch_norm"; framework::OpDesc op_desc(op, nullptr); - PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1, - platform::errors::InvalidArgument( - "Invalid input X's size of batch_norm TRT converter. " - "Expected 1, received %d.", - op_desc.Input("X").size())); - PADDLE_ENFORCE_EQ(op_desc.Input("Bias").size(), 1, - platform::errors::InvalidArgument( - "Invalid input Bias's size of batch_norm TRT " - "converter. Expected 1, received %d.", - op_desc.Input("Bias").size())); // Bias is a weight - PADDLE_ENFORCE_EQ(op_desc.Input("Mean").size(), 1, - platform::errors::InvalidArgument( - "Invalid input Mean's size of batch_norm TRT " - "converter. Expected 1, received %d.", - op_desc.Input("Mean").size())); // Mean is a weight - PADDLE_ENFORCE_EQ(op_desc.Input("Scale").size(), 1, - platform::errors::InvalidArgument( - "Invalid input Scale's size of batch_norm TRT " - "converter. Expected 1, received %d.", - op_desc.Input("Scale").size())); // Scale is a weight - PADDLE_ENFORCE_EQ( - op_desc.Input("Variance").size(), 1, - platform::errors::InvalidArgument( - "Invalid input Variance's size of batch_norm TRT converter. " - "Expected 1, received %d.", - op_desc.Input("Variance").size())); // Variance is a weight - PADDLE_ENFORCE_EQ(op_desc.Output("Y").size(), 1, - platform::errors::InvalidArgument( - "Invalid output Y's size of batch_norm TRT " - "converter. 
Expected 1, received %d.", - op_desc.Output("Y").size())); - auto* X = engine_->GetITensor(op_desc.Input("X").front()); // Declare weights auto* Bias_v = scope.FindVar(op_desc.Input("Bias").front()); diff --git a/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc b/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc index 5515cd35daedc..ba47358b147db 100644 --- a/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc @@ -36,18 +36,6 @@ void ConvertConv2d(TensorRTEngine* engine, const framework::proto::OpDesc& op, VLOG(3) << "convert a fluid " << name << " op to tensorrt layer without bias"; framework::OpDesc op_desc(op, nullptr); - PADDLE_ENFORCE_EQ(op_desc.Input("Input").size(), 1UL, - platform::errors::InvalidArgument( - "TRT Conv2d expect 1 input, but got %d input.", - op_desc.Input("Input").size())); - PADDLE_ENFORCE_EQ(op_desc.Input("Filter").size(), 1UL, - platform::errors::InvalidArgument( - "TRT Conv2d expect 1 filter, but got %d filter.", - op_desc.Input("Filter").size())); - PADDLE_ENFORCE_EQ(op_desc.Output("Output").size(), 1UL, - platform::errors::InvalidArgument( - "TRT Conv2d expect 1 output, but got %d output.", - op_desc.Output("Output").size())); auto* X = engine->GetITensor(op_desc.Input("Input").front()); std::string filter_var_name = op_desc.Input("Filter").front(); @@ -61,13 +49,6 @@ void ConvertConv2d(TensorRTEngine* engine, const framework::proto::OpDesc& op, if (enable_int8) { #if IS_TRT_VERSION_GE(5000) - if (op_desc.Type() != "conv2d_transpose") { - PADDLE_ENFORCE_EQ( - op_desc.HasAttr("Input_scale"), true, - platform::errors::InvalidArgument("Input scale not found. TRT int8" - " requires conv/deconv to have " - "input quantization scales.")); - } float in_scale = BOOST_GET_CONST(float, op_desc.GetAttr("Input_scale")) * 127; auto weight_scale = @@ -184,14 +165,6 @@ class Deconv2dOpConverter : public OpConverter { return layer; }, [](nvinfer1::IDeconvolutionLayer* layer, nvinfer1::DimsHW& dilations) { - // In trt Deconv, dilation should be 1, ohter values are not - // supported. 
- bool condition = (dilations.d[0] == 1 && dilations.d[1] == 1); - PADDLE_ENFORCE_EQ(condition, true, - platform::errors::InvalidArgument( - "In Deconv, Dilations must be (1, 1) for " - "tensorRT, but given (%d, %d)", - dilations.d[0], dilations.d[1])); }, "conv2d_transpose"); } diff --git a/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc b/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc index 74057addecd1f..5419933e40736 100644 --- a/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc @@ -43,25 +43,6 @@ class ElementwiseWeightOpConverter : public OpConverter { framework::OpDesc op_desc(op, nullptr); VLOG(3) << "Convert a fluid elementwise op to TensorRT IScaleLayer"; - PADDLE_ENFORCE_EQ( - op_desc.Input("X").size(), 1, - platform::errors::InvalidArgument( - "The input op's Input(\"X\").size() " - "should equal to 1, but received Input(\"X\").size() = %u.", - op_desc.Input("X").size())); - PADDLE_ENFORCE_EQ( - op_desc.Input("Y").size(), 1, - platform::errors::InvalidArgument( - "The input op's Input(\"Y\").size() " - "should equal to 1, but received Input(\"Y\").size() = %u.", - op_desc.Input("Y").size())); // Y is a weight - PADDLE_ENFORCE_EQ( - op_desc.Output("Out").size(), 1, - platform::errors::InvalidArgument( - "The input op's Output(\"Out\").size() " - "should equal to 1, but reveceid Output(\"Out\").size() = %u.", - op_desc.Output("Out").size())); - auto* X = engine_->GetITensor(op_desc.Input("X").front()); auto* Y_v = scope.FindVar(op_desc.Input("Y").front()); PADDLE_ENFORCE_NOT_NULL( @@ -193,25 +174,6 @@ class ElementwiseTensorOpConverter : public OpConverter { framework::OpDesc op_desc(op, nullptr); nvinfer1::ILayer* layer = nullptr; - PADDLE_ENFORCE_EQ( - op_desc.Input("X").size(), 1, - platform::errors::InvalidArgument( - "The input op's Input(\"X\").size() " - "should equal to 1, but received Input(\"X\").size() = %u.", - op_desc.Input("X").size())); - PADDLE_ENFORCE_EQ( - op_desc.Input("Y").size(), 1, - platform::errors::InvalidArgument( - "The input op's Input(\"Y\").size() " - "should equal to 1, but received Input(\"Y\").size() = %u.", - op_desc.Input("Y").size())); // Y is a weight - PADDLE_ENFORCE_EQ( - op_desc.Output("Out").size(), 1, - platform::errors::InvalidArgument( - "The input op's Output(\"Out\").size() " - "should equal to 1, but received Output(\"Out\").size() = %u.", - op_desc.Output("Out").size())); - auto* X = engine_->GetITensor(op_desc.Input("X").front()); auto* Y = engine_->GetITensor(op_desc.Input("Y").front()); std::vector itensors; diff --git a/paddle/fluid/inference/tensorrt/convert/emb_eltwise_layernorm.cc b/paddle/fluid/inference/tensorrt/convert/emb_eltwise_layernorm.cc index 957dfe0369898..57ac30b5f6bd7 100644 --- a/paddle/fluid/inference/tensorrt/convert/emb_eltwise_layernorm.cc +++ b/paddle/fluid/inference/tensorrt/convert/emb_eltwise_layernorm.cc @@ -31,16 +31,11 @@ class EmbEltwiseLayerNormOpConverter : public OpConverter { void operator()(const framework::proto::OpDesc& op, const framework::Scope& scope, bool test_mode) override { #if IS_TRT_VERSION_GE(6000) - VLOG(4) << "convert fluid swish op to tensorrt layer"; + VLOG(4) << "convert fluid EmbEltwiseLayerNorm op to tensorrt layer"; framework::OpDesc op_desc(op, nullptr); auto id_names = op_desc.Input("Ids"); auto emb_names = op_desc.Input("Embs"); - - PADDLE_ENFORCE_EQ(id_names.size(), emb_names.size(), - platform::errors::InvalidArgument( - "The id and emb size of fused EmbEltwiseLayerNormOp " - 
"should be same ")); int input_num = id_names.size(); // Declare inputs @@ -89,97 +84,92 @@ class EmbEltwiseLayerNormOpConverter : public OpConverter { int64_t bias_size = framework::product(bias_dims); int64_t scale_size = framework::product(scale_dims); nvinfer1::ILayer* layer = nullptr; + bool enable_int8 = op_desc.HasAttr("enable_int8"); - if (engine_->with_dynamic_shape()) { - if (engine_->use_oss()) { - int output_fp16 = static_cast((engine_->WithFp16() == 1) ? 1 : 0); - PADDLE_ENFORCE_EQ( - output_fp16, 1, - platform::errors::InvalidArgument( - "Only Precision::KHalf(fp16) is supported when infering " - "ernie(bert) model with config.EnableTensorRtOSS(). " - "But Precision::KFloat32 is setted.")); - const std::vector fields{ - {"bert_embeddings_layernorm_beta", bias, - nvinfer1::PluginFieldType::kFLOAT32, - static_cast(bias_size)}, - {"bert_embeddings_layernorm_gamma", scale, - nvinfer1::PluginFieldType::kFLOAT32, - static_cast(scale_size)}, - {"bert_embeddings_word_embeddings", input_embs[0], - nvinfer1::PluginFieldType::kFLOAT32, - static_cast(emb_sizes[0])}, - {"bert_embeddings_token_type_embeddings", input_embs[2], - nvinfer1::PluginFieldType::kFLOAT32, - static_cast(emb_sizes[2])}, - {"bert_embeddings_position_embeddings", input_embs[1], - nvinfer1::PluginFieldType::kFLOAT32, - static_cast(emb_sizes[1])}, - {"output_fp16", &output_fp16, nvinfer1::PluginFieldType::kINT32, 1}, - }; - - // remember to free - nvinfer1::PluginFieldCollection* plugin_ptr = - static_cast( - malloc(sizeof(*plugin_ptr) + - fields.size() * sizeof(nvinfer1::PluginField))); - plugin_ptr->nbFields = static_cast(fields.size()); - plugin_ptr->fields = fields.data(); - - std::vector plugin_inputs; - plugin_inputs.emplace_back(engine_->GetITensor( - engine_->network()->getInput(0)->getName())); // word_embedding, - // eval_placeholder_0 - plugin_inputs.emplace_back(engine_->GetITensor( - engine_->network()->getInput(1)->getName())); // sent_embedding, - // eval_placeholder_1 - plugin_inputs.emplace_back(engine_->GetITensor( - engine_->network()->getInput(2)->getName())); // cu_seqlens, - // eval_placeholder_2 - auto max_seqlen_tensor = - engine_->GetITensor(engine_->network()->getInput(3)->getName()); - auto* shuffle_layer = TRT_ENGINE_ADD_LAYER( - engine_, Shuffle, - *const_cast(max_seqlen_tensor)); - nvinfer1::Dims shape_dim; - shape_dim.nbDims = 1; - shape_dim.d[0] = -1; - shuffle_layer->setReshapeDimensions(shape_dim); - plugin_inputs.emplace_back( - shuffle_layer->getOutput(0)); // max_seqlen, eval_placeholder_3 - - auto creator = GetPluginRegistry()->getPluginCreator( - "CustomEmbLayerNormPluginDynamic", "2"); - - auto plugin_obj = creator->createPlugin( - "CustomEmbLayerNormPluginDynamic", plugin_ptr); - auto plugin_layer = engine_->network()->addPluginV2( - plugin_inputs.data(), plugin_inputs.size(), *plugin_obj); - layer = plugin_layer; - free(plugin_ptr); - auto output_name = op_desc.Output("Out")[0]; - RreplenishLayerAndOutput(layer, "emb_eltwise_layernorm", - {output_name, std::string("qkv_plugin_mask")}, - test_mode); - } else { - bool with_fp16 = - engine_->WithFp16() && !engine_->disable_trt_plugin_fp16(); - float eps = BOOST_GET_CONST(float, op_desc.GetAttr("epsilon")); - plugin::DynamicPluginTensorRT* plugin = nullptr; - plugin = new plugin::EmbEltwiseLayernormPluginDynamic( - input_embs, bias, scale, emb_sizes, bias_size, scale_size, hidden, - eps, with_fp16); - layer = engine_->AddDynamicPlugin(input_ids.data(), input_num, plugin); - auto output_name = op_desc.Output("Out")[0]; - 
RreplenishLayerAndOutput(layer, "emb_eltwise_layernorm", {output_name}, - test_mode); + if (engine_->use_oss()) { + int output_fp16 = static_cast((engine_->WithFp16() == 1) ? 1 : 0); + if (enable_int8) { + output_fp16 = 1; } + PADDLE_ENFORCE_EQ( + output_fp16, 1, + platform::errors::InvalidArgument( + "Only Precision::KHalf(fp16) is supported when infering " + "ernie(bert) model with config.EnableTensorRtOSS(). " + "But Precision::KFloat32 is setted.")); + const std::vector fields{ + {"bert_embeddings_layernorm_beta", bias, + nvinfer1::PluginFieldType::kFLOAT32, + static_cast(bias_size)}, + {"bert_embeddings_layernorm_gamma", scale, + nvinfer1::PluginFieldType::kFLOAT32, + static_cast(scale_size)}, + {"bert_embeddings_word_embeddings", input_embs[0], + nvinfer1::PluginFieldType::kFLOAT32, + static_cast(emb_sizes[0])}, + {"bert_embeddings_token_type_embeddings", input_embs[2], + nvinfer1::PluginFieldType::kFLOAT32, + static_cast(emb_sizes[2])}, + {"bert_embeddings_position_embeddings", input_embs[1], + nvinfer1::PluginFieldType::kFLOAT32, + static_cast(emb_sizes[1])}, + {"output_fp16", &output_fp16, nvinfer1::PluginFieldType::kINT32, 1}, + }; + + // remember to free + nvinfer1::PluginFieldCollection* plugin_ptr = + static_cast( + malloc(sizeof(*plugin_ptr) + + fields.size() * sizeof(nvinfer1::PluginField))); + plugin_ptr->nbFields = static_cast(fields.size()); + plugin_ptr->fields = fields.data(); + + std::vector plugin_inputs; + plugin_inputs.emplace_back(engine_->GetITensor( + engine_->network()->getInput(0)->getName())); // word_embedding, + // eval_placeholder_0 + plugin_inputs.emplace_back(engine_->GetITensor( + engine_->network()->getInput(1)->getName())); // sent_embedding, + // eval_placeholder_1 + plugin_inputs.emplace_back(engine_->GetITensor( + engine_->network()->getInput(2)->getName())); // cu_seqlens, + // eval_placeholder_2 + auto max_seqlen_tensor = + engine_->GetITensor(engine_->network()->getInput(3)->getName()); + auto* shuffle_layer = + TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *max_seqlen_tensor); + nvinfer1::Dims shape_dim; + shape_dim.nbDims = 1; + shape_dim.d[0] = -1; + shuffle_layer->setReshapeDimensions(shape_dim); + plugin_inputs.emplace_back( + shuffle_layer->getOutput(0)); // max_seqlen, eval_placeholder_3 + + auto creator = GetPluginRegistry()->getPluginCreator( + "CustomEmbLayerNormPluginDynamic", "2"); + + auto plugin_obj = + creator->createPlugin("CustomEmbLayerNormPluginDynamic", plugin_ptr); + auto plugin_layer = engine_->network()->addPluginV2( + plugin_inputs.data(), plugin_inputs.size(), *plugin_obj); + layer = plugin_layer; + free(plugin_ptr); + auto output_name = op_desc.Output("Out")[0]; + RreplenishLayerAndOutput(layer, "emb_eltwise_layernorm", + {output_name, std::string("qkv_plugin_mask")}, + test_mode); } else { - PADDLE_THROW(platform::errors::Fatal( - "You are running the Ernie(Bert) model in static" - "shape mode, which is not supported for the time being.\n" - "You can use the config.SetTRTDynamicShapeInfo(...) 
interface" - " to set the shape information to run the dynamic shape mode.")); + bool with_fp16 = + engine_->WithFp16() && !engine_->disable_trt_plugin_fp16(); + float eps = BOOST_GET_CONST(float, op_desc.GetAttr("epsilon")); + plugin::DynamicPluginTensorRT* plugin = nullptr; + plugin = new plugin::EmbEltwiseLayernormPluginDynamic( + input_embs, bias, scale, emb_sizes, bias_size, scale_size, hidden, + eps, with_fp16); + layer = engine_->AddDynamicPlugin(input_ids.data(), input_num, plugin); + auto output_name = op_desc.Output("Out")[0]; + RreplenishLayerAndOutput(layer, "emb_eltwise_layernorm", {output_name}, + test_mode); } #else diff --git a/paddle/fluid/inference/tensorrt/convert/fc_op.cc b/paddle/fluid/inference/tensorrt/convert/fc_op.cc index 527d0ee208578..aebdb8f884c2c 100644 --- a/paddle/fluid/inference/tensorrt/convert/fc_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/fc_op.cc @@ -106,8 +106,22 @@ class FcOpConverter : public OpConverter { auto regist_fc = [&](nvinfer1::ITensor* inputs, int n_output, TensorRTEngine::Weight& weight, TensorRTEngine::Weight& bias) { - auto* fc_layer = TRT_ENGINE_ADD_LAYER(engine_, FullyConnected, *inputs, - n_output, weight.get(), bias.get()); + nvinfer1::ILayer* fc_layer = nullptr; + if (enable_int8) { + PADDLE_ENFORCE_EQ( + op_desc.HasAttr("out_threshold"), true, + platform::errors::InvalidArgument( + "must have out threshold in fc layers in int8 mode")); + float out_scale = + BOOST_GET_CONST(float, op_desc.GetAttr("out_threshold")); + nvinfer1::DimsHW nv_ksize(1, 1); + fc_layer = TRT_ENGINE_ADD_LAYER(engine_, Convolution, *inputs, n_output, + nv_ksize, weight.get(), bias.get()); + engine_->SetTensorDynamicRange(fc_layer->getOutput(0), out_scale); + } else { + fc_layer = TRT_ENGINE_ADD_LAYER(engine_, FullyConnected, *inputs, + n_output, weight.get(), bias.get()); + } auto output_name = op_desc.Output("Out").front(); if (activation_type == "relu") { @@ -146,66 +160,61 @@ class FcOpConverter : public OpConverter { if (engine_->with_dynamic_shape()) { // not NCHW layout, but NLP layout with added 'x 1 x 1' auto x_dim = X->getDimensions(); - if (x_dim.nbDims == 3 || x_dim.nbDims == 2) { - auto output_name = op_desc.Output("Out").front(); - // add shuffle before fc - nvinfer1::Dims reshape_before_fc_dim; - reshape_before_fc_dim.nbDims = x_dim.nbDims + 2; - for (int i = 0; i < x_dim.nbDims; i++) { - reshape_before_fc_dim.d[i] = 0; - } - reshape_before_fc_dim.d[x_dim.nbDims] = 1; - reshape_before_fc_dim.d[x_dim.nbDims + 1] = 1; - auto* reshape_before_fc_layer = - TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *X); - reshape_before_fc_layer->setReshapeDimensions(reshape_before_fc_dim); - reshape_before_fc_layer->setName( - ("shuffle_before_fc(Output: " + output_name + ")").c_str()); + PADDLE_ENFORCE_LE( + x_dim.nbDims - x_num_col_dims, 3, + platform::errors::InvalidArgument( + "Params and input dims mismatch. 
Paddle-TRT FC " + "converter expects x_dim.nbDims - x_num_col_dims <= 3, but " + "x_dim.nbDims = %d, x_num_col_dims = %d.", + x_dim.nbDims, x_num_col_dims)); + auto output_name = op_desc.Output("Out").front(); + // add shuffle before fc + nvinfer1::Dims reshape_before_fc_dim; + // padding shape "x 1 x 1" + int padding_length = 3 - (x_dim.nbDims - x_num_col_dims); + reshape_before_fc_dim.nbDims = x_dim.nbDims + padding_length; + int cur_dim_index = reshape_before_fc_dim.nbDims - 1; + while (padding_length-- > 0) { + reshape_before_fc_dim.d[cur_dim_index--] = 1; + } + while (cur_dim_index >= 0) { + reshape_before_fc_dim.d[cur_dim_index--] = 0; + } - // add fc layer - auto* fc_layer = TRT_ENGINE_ADD_LAYER( - engine_, FullyConnected, *reshape_before_fc_layer->getOutput(0), - n_output, weight.get(), bias.get()); - fc_layer->setName(("fc_layer(Output: " + output_name + ")").c_str()); + auto* reshape_before_fc_layer = + TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *X); + reshape_before_fc_layer->setReshapeDimensions(reshape_before_fc_dim); + reshape_before_fc_layer->setName( + ("shuffle_before_fc(Output: " + output_name + ")").c_str()); - // add shuffle after fc - nvinfer1::Dims reshape_after_fc_dim; - if (x_dim.nbDims == 3) { - if (x_num_col_dims == 2) { - reshape_after_fc_dim.nbDims = 3; - reshape_after_fc_dim.d[0] = 0; - reshape_after_fc_dim.d[1] = 0; - reshape_after_fc_dim.d[2] = 0; - } else { - reshape_after_fc_dim.nbDims = 2; - reshape_after_fc_dim.d[0] = 0; - auto dim = fc_layer->getOutput(0)->getDimensions(); - reshape_after_fc_dim.d[1] = dim.d[1] * dim.d[2]; - } - // x_dim.nbDims == 2 - } else { - reshape_after_fc_dim.nbDims = 2; - reshape_after_fc_dim.d[0] = 0; - reshape_after_fc_dim.d[1] = 0; - } - auto* reshape_after_fc_layer = - TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *fc_layer->getOutput(0)); - reshape_after_fc_layer->setReshapeDimensions(reshape_after_fc_dim); + // add fc layer + auto* fc_layer = TRT_ENGINE_ADD_LAYER( + engine_, FullyConnected, *reshape_before_fc_layer->getOutput(0), + n_output, weight.get(), bias.get()); + fc_layer->setName(("fc_layer(Output: " + output_name + ")").c_str()); - if (activation_type == "relu") { - reshape_after_fc_layer->setName( - ("shuffle_after_fc(Output: " + output_name + ")").c_str()); - nvinfer1::IActivationLayer* relu_layer = TRT_ENGINE_ADD_LAYER( - engine_, Activation, *(reshape_after_fc_layer->getOutput(0)), - nvinfer1::ActivationType::kRELU); - RreplenishLayerAndOutput(relu_layer, "relu_after_fc_shuffle", - {output_name}, test_mode); - } else { - RreplenishLayerAndOutput(reshape_after_fc_layer, "shuffle_after_fc", - {output_name}, test_mode); - } + // add shuffle after fc + nvinfer1::Dims reshape_after_fc_dim; + reshape_after_fc_dim.nbDims = x_num_col_dims + 1; + for (int i = 0; i < reshape_after_fc_dim.nbDims; i++) { + reshape_after_fc_dim.d[i] = 0; + } + + auto* reshape_after_fc_layer = + TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *fc_layer->getOutput(0)); + reshape_after_fc_layer->setReshapeDimensions(reshape_after_fc_dim); + + if (activation_type == "relu") { + reshape_after_fc_layer->setName( + ("shuffle_after_fc(Output: " + output_name + ")").c_str()); + nvinfer1::IActivationLayer* relu_layer = TRT_ENGINE_ADD_LAYER( + engine_, Activation, *(reshape_after_fc_layer->getOutput(0)), + nvinfer1::ActivationType::kRELU); + RreplenishLayerAndOutput(relu_layer, "relu_after_fc_shuffle", + {output_name}, test_mode); } else { - regist_fc(X, n_output, weight, bias); + RreplenishLayerAndOutput(reshape_after_fc_layer, "shuffle_after_fc", + {output_name}, 
test_mode); } return; } @@ -229,13 +238,24 @@ class FcOpConverter : public OpConverter { "dims equals to 4, the last dim of input must be 1, but got %d", input_d[3])); } - for (int i = 0; i < 3; i++) { - if (i < input_dims) { - reshape_dim3[i] = input_d[i]; - } else { - reshape_dim3[i] = 1; + if (enable_int8) { + reshape_dim3[0] = 1; + for (int i = 0; i < 3; i++) { + reshape_dim3[0] *= input_d[i]; + if (i > 0) { + reshape_dim3[i] = 1; + } + } + } else { + for (int i = 0; i < 3; i++) { + if (i < input_dims) { + reshape_dim3[i] = input_d[i]; + } else { + reshape_dim3[i] = 1; + } } } + nvinfer1::Dims3 reshape_dim(reshape_dim3[0], reshape_dim3[1], reshape_dim3[2]); auto* reshape_layer = TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *X); @@ -249,11 +269,25 @@ class FcOpConverter : public OpConverter { platform::errors::InvalidArgument( "Invalid dimensions. When x_num_col_dims equals to " "2, input_dims should not be 1")); - for (int i = 0; i < 4; i++) { - if (i < input_dims) { - reshape_dim4[i] = input_d[i]; - } else { - reshape_dim4[i] = 1; + + if (enable_int8) { + for (int i = 0; i < 4; i++) { + if (i == 0) { + reshape_dim4[i] = input_d[i]; + } else { + reshape_dim4[i] = 1; + if (i < input_dims) { + reshape_dim4[1] *= input_d[i]; + } + } + } + } else { + for (int i = 0; i < 4; i++) { + if (i < input_dims) { + reshape_dim4[i] = input_d[i]; + } else { + reshape_dim4[i] = 1; + } } } nvinfer1::Dims4 reshape_dim(reshape_dim4[0], reshape_dim4[1], diff --git a/paddle/fluid/inference/tensorrt/convert/gelu_op.cc b/paddle/fluid/inference/tensorrt/convert/gelu_op.cc index ca5b6a8b52e79..0436499cd4075 100644 --- a/paddle/fluid/inference/tensorrt/convert/gelu_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/gelu_op.cc @@ -47,15 +47,7 @@ class GeluOpConverter : public OpConverter { framework::OpDesc op_desc(op, nullptr); // Declare inputs int input_num = op_desc.Input("X").size(); - PADDLE_ENFORCE_EQ(input_num, 1, - platform::errors::InvalidArgument( - "gelu op has only 1 input, but got %d", input_num)); auto* input = engine_->GetITensor(op_desc.Input("X")[0]); - // Get output - size_t output_num = op_desc.Output("Out").size(); - PADDLE_ENFORCE_EQ(output_num, 1, - platform::errors::InvalidArgument( - "gelu op has only 1 output, but got %d", output_num)); nvinfer1::ILayer* layer = nullptr; if (engine_->with_dynamic_shape()) { diff --git a/paddle/fluid/inference/tensorrt/convert/hard_swish_op.cc b/paddle/fluid/inference/tensorrt/convert/hard_swish_op.cc index 9dc40ceec4809..7ef79e547d09a 100644 --- a/paddle/fluid/inference/tensorrt/convert/hard_swish_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/hard_swish_op.cc @@ -41,17 +41,7 @@ class HardSwishOpConverter : public OpConverter { framework::OpDesc op_desc(op, nullptr); // Declare inputs int input_num = op_desc.Input("X").size(); - PADDLE_ENFORCE_EQ( - input_num, 1, - platform::errors::InvalidArgument( - "HardSwish op has only 1 input, but got %d", input_num)); auto* input = engine_->GetITensor(op_desc.Input("X")[0]); - // Get output - size_t output_num = op_desc.Output("Out").size(); - PADDLE_ENFORCE_EQ( - output_num, 1, - platform::errors::InvalidArgument( - "HardSwish op has only 1 output, but got %d", output_num)); const float threshold = op_desc.HasAttr("threshold") diff --git a/paddle/fluid/inference/tensorrt/convert/layer_norm_op.cc b/paddle/fluid/inference/tensorrt/convert/layer_norm_op.cc index c1f266bacfec5..0b97b5d87a3d5 100644 --- a/paddle/fluid/inference/tensorrt/convert/layer_norm_op.cc +++ 
b/paddle/fluid/inference/tensorrt/convert/layer_norm_op.cc @@ -25,25 +25,6 @@ class LayerNormOpConverter : public OpConverter { const framework::Scope& scope, bool test_mode) override { VLOG(4) << "convert a fluid layer_norm op to tensorrt layer_norm plugin"; framework::OpDesc op_desc(op, nullptr); - PADDLE_ENFORCE_EQ( - op_desc.Input("X").size(), 1, - platform::errors::InvalidArgument( - "input of layer_norm op converter should be 1, got %d", - op_desc.Input("X").size())); - PADDLE_ENFORCE_EQ(op_desc.Input("Bias").size(), 1, - platform::errors::InvalidArgument( - "Bias of layer_norm op converter should be 1, got %d", - op_desc.Input("Bias").size())); // Bias is a weight - PADDLE_ENFORCE_EQ( - op_desc.Input("Scale").size(), 1, - platform::errors::InvalidArgument( - "Scale of layer_norm op converter should be 1, got %d", - op_desc.Input("Scale").size())); // Scale is a weight - PADDLE_ENFORCE_EQ( - op_desc.Output("Y").size(), 1, - platform::errors::InvalidArgument( - "output of layer_norm op converter should be 1, got %d", - op_desc.Input("Y").size())); auto* X = engine_->GetITensor(op_desc.Input("X").front()); auto* Bias_v = scope.FindVar(op_desc.Input("Bias").front()); diff --git a/paddle/fluid/inference/tensorrt/convert/leaky_relu_op.cc b/paddle/fluid/inference/tensorrt/convert/leaky_relu_op.cc index c2ffb3f3197c1..d6277b5208d5a 100644 --- a/paddle/fluid/inference/tensorrt/convert/leaky_relu_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/leaky_relu_op.cc @@ -36,21 +36,7 @@ class LeakyReluOpConverter : public OpConverter { VLOG(4) << "convert fluid leaky_relu op to tensorrt layer"; framework::OpDesc op_desc(op, nullptr); - // Declare inputs - size_t input_num = op_desc.Input("X").size(); - PADDLE_ENFORCE_EQ(input_num, 1UL, - platform::errors::InvalidArgument( - "Invalid number of TRT leaky_relu op converter " - "inputs. Expected 1, but received %d", - input_num)); auto* input = engine_->GetITensor(op_desc.Input("X")[0]); - // Get output - size_t output_num = op_desc.Output("Out").size(); - PADDLE_ENFORCE_EQ(output_num, 1UL, - platform::errors::InvalidArgument( - "Invalid number of TRT leaky_relu op converter " - "outputs. 
Expected 1, but received %d", - output_num)); // Get attrs float alpha = BOOST_GET_CONST(float, op_desc.GetAttr("alpha")); nvinfer1::ILayer* output_layer = nullptr; diff --git a/paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc b/paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc index 2008646549132..f2f45c694ab44 100644 --- a/paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc @@ -40,8 +40,25 @@ class MultiheadMatMulOpConverter : public OpConverter { auto* bias_v = scope.FindVar(bias_name); auto* bias_t = bias_v->GetMutable(); - float* weight_data = - engine_->GetWeightCPUData(weight_name, weight_t, false); + float* weight_data = nullptr; + bool enable_int8 = op_desc.HasAttr("enable_int8"); + float in_scale = 0.; + + if (enable_int8) { + PADDLE_ENFORCE_EQ( + op_desc.HasAttr("Input_scale"), true, + platform::errors::InvalidArgument( + "must have input scale in multihead layers in int8 mode")); + in_scale = BOOST_GET_CONST(float, op_desc.GetAttr("Input_scale")) * 127; + auto weight_scale = + BOOST_GET_CONST(std::vector, op_desc.GetAttr("weight_scale")); + weight_data = + engine_->GetWeightCPUData(weight_name, weight_t, true, weight_scale); + engine_->SetTensorDynamicRange(input, in_scale); + } else { + weight_data = engine_->GetWeightCPUData(weight_name, weight_t, false); + } + float* bias_data = engine_->GetWeightCPUData(bias_name, bias_t, false); std::vector weight_data_tmp; weight_data_tmp.reserve(weight_t->numel()); @@ -117,8 +134,27 @@ class MultiheadMatMulOpConverter : public OpConverter { static_cast(bias_data), static_cast(bias_t->numel())}; - auto* fc_layer = TRT_ENGINE_ADD_LAYER(engine_, FullyConnected, *input, - n, weight, bias); + nvinfer1::ILayer* fc_layer = nullptr; + float dp_probs = 1.0 / 127.0; + if (enable_int8) { + nvinfer1::DimsHW nv_ksize(1, 1); + fc_layer = TRT_ENGINE_ADD_LAYER(engine_, Convolution, *input, n, + nv_ksize, weight, bias); + } else { + fc_layer = TRT_ENGINE_ADD_LAYER(engine_, FullyConnected, *input, n, + weight, bias); + } + + if (enable_int8) { + PADDLE_ENFORCE_EQ( + op_desc.HasAttr("out_threshold"), true, + platform::errors::InvalidArgument( + "must have out threshold in multihead layers in int8 mode")); + float out_scale = + BOOST_GET_CONST(float, op_desc.GetAttr("out_threshold")); + engine_->SetTensorDynamicRange(fc_layer->getOutput(0), out_scale); + dp_probs = out_scale / 127.0; + } auto mask_tensor = engine_->GetITensor("qkv_plugin_mask"); @@ -128,6 +164,9 @@ class MultiheadMatMulOpConverter : public OpConverter { int type = static_cast((engine_->WithFp16() == 1) ? 
nvinfer1::DataType::kHALF : nvinfer1::DataType::kFLOAT); + if (enable_int8) { + type = static_cast(nvinfer1::DataType::kHALF); + } bool has_mask = true; int var_seqlen = 1; const std::vector fields{ @@ -136,7 +175,7 @@ class MultiheadMatMulOpConverter : public OpConverter { {"num_heads", &head_number, nvinfer1::PluginFieldType::kINT32, 1}, {"has_mask", &has_mask, nvinfer1::PluginFieldType::kINT32, 1}, {"var_seqlen", &var_seqlen, nvinfer1::PluginFieldType::kINT32, 1}, - }; + { "dq_probs", &dp_probs, nvinfer1::PluginFieldType::kFLOAT32, 1 }}; nvinfer1::PluginFieldCollection* plugin_collection = static_cast( malloc(sizeof(*plugin_collection) + diff --git a/paddle/fluid/inference/tensorrt/convert/nearest_interp_op.cc b/paddle/fluid/inference/tensorrt/convert/nearest_interp_op.cc index e91a2ee13f4c2..3940cc5dce1b0 100644 --- a/paddle/fluid/inference/tensorrt/convert/nearest_interp_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/nearest_interp_op.cc @@ -65,13 +65,6 @@ class NearestInterpolateOpConverter : public OpConverter { scale_w = scale; } else { // axis are different in static/dynamic mode - PADDLE_ENFORCE_GT( - out_h, 0, platform::errors::InvalidArgument( - "out_h must be greater than 0 if scale is not set.")); - PADDLE_ENFORCE_GT( - out_w, 0, platform::errors::InvalidArgument( - "out_w must be greater than 0 if scale is not set.")); - bool with_dynamic = engine_->with_dynamic_shape(); int h_axis = (data_layout == framework::DataLayout::kNCHW) + with_dynamic; diff --git a/paddle/fluid/inference/tensorrt/convert/pad_op.cc b/paddle/fluid/inference/tensorrt/convert/pad_op.cc index 6bf50e4742dd2..d6711bbbd2cb5 100644 --- a/paddle/fluid/inference/tensorrt/convert/pad_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/pad_op.cc @@ -43,8 +43,6 @@ class PadOpConverter : public OpConverter { const std::vector paddings = BOOST_GET_CONST(std::vector, op_desc.GetAttr("paddings")); - const float pad_value = - BOOST_GET_CONST(float, op_desc.GetAttr("pad_value")); nvinfer1::Dims input_shape = input->getDimensions(); int nbDims = input_shape.nbDims; @@ -62,9 +60,6 @@ class PadOpConverter : public OpConverter { "(nbDims + 1) * 2 == pad_size. 
But " "received nbDims:%d, pad_size:%d.", nbDims, pad_size)); - PADDLE_ENFORCE_EQ(pad_value, 0.0, - platform::errors::InvalidArgument( - "The pad layer of TRT only support zero.")); nvinfer1::DimsHW pre_pad(paddings[pad_size - 4], paddings[pad_size - 2]); nvinfer1::DimsHW post_pad(paddings[pad_size - 3], paddings[pad_size - 1]); diff --git a/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc b/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc index c10072602d7c5..90d6392fd6404 100644 --- a/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc @@ -66,15 +66,6 @@ class Pool2dOpConverter : public OpConverter { VLOG(4) << "convert a fluid pool2d op to tensorrt pool2d layer without bias"; framework::OpDesc op_desc(op, nullptr); - PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1UL, - platform::errors::InvalidArgument( - "TRT Pool2d expect 1 input, but got %d input.", - op_desc.Input("X").size())); - PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1UL, - platform::errors::InvalidArgument( - "TRT Pool2d expect 1 Output, but got %d output.", - op_desc.Output("Out").size())); - auto *input1 = engine_->GetITensor(op_desc.Input("X")[0]); nvinfer1::Dims input_shape = input1->getDimensions(); int input_dims = input_shape.nbDims; @@ -110,10 +101,6 @@ class Pool2dOpConverter : public OpConverter { nv_pool_type = nvinfer1::PoolingType::kAVERAGE; reduce_operation = nvinfer1::ReduceOperation::kAVG; plugin_pool_type = plugin::PoolPlugin::PoolType::avg; - } else { - PADDLE_THROW(platform::errors::Fatal( - "Wrong pool op type, the trt do not support the %s pool type.", - pool_type)); } nvinfer1::DimsHW nv_ksize(ksize[0], ksize[1]); diff --git a/paddle/fluid/inference/tensorrt/convert/prelu_op.cc b/paddle/fluid/inference/tensorrt/convert/prelu_op.cc index 74d77d8be4493..a8a36e1238168 100644 --- a/paddle/fluid/inference/tensorrt/convert/prelu_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/prelu_op.cc @@ -31,19 +31,7 @@ class PReluOpConverter : public OpConverter { framework::OpDesc op_desc(op, nullptr); // Declare inputs size_t input_num = op_desc.Input("X").size(); - PADDLE_ENFORCE_EQ(input_num, 1UL, - platform::errors::InvalidArgument( - "Invalid input X's size of prelu TRT converter. " - "Expected 1, received %d.", - input_num)); auto* input = engine_->GetITensor(op_desc.Input("X")[0]); - // Get output - size_t output_num = op_desc.Output("Out").size(); - PADDLE_ENFORCE_EQ(output_num, 1UL, - platform::errors::InvalidArgument( - "Invalid output Out's size of prelu TRT converter. 
" - "Expected 1, received %d.", - output_num)); // Get attrs std::string mode = BOOST_GET_CONST(std::string, op_desc.GetAttr("mode")); // diff --git a/paddle/fluid/inference/tensorrt/convert/roi_align_op.cc b/paddle/fluid/inference/tensorrt/convert/roi_align_op.cc index 1329608aecd20..654fe7e013379 100644 --- a/paddle/fluid/inference/tensorrt/convert/roi_align_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/roi_align_op.cc @@ -62,12 +62,6 @@ class RoiAlignOpConverter : public OpConverter { std::vector inputs{input_tensor, rois_tensor}; nvinfer1::ILayer* layer = nullptr; - PADDLE_ENFORCE_EQ( - engine_->with_dynamic_shape(), true, - platform::errors::InvalidArgument( - "TRT roi align plugin only accept the dynamic shape, because that " - "the roi_align will change the batch size.")); - auto* roi_align_plugin = new plugin::RoiAlignPluginDynamic( data_type_, pooled_height, pooled_width, spatial_scale, sampling_ratio); auto roi_align_layer = engine_->network()->addPluginV2( diff --git a/paddle/fluid/inference/tensorrt/convert/shuffle_channel_op.cc b/paddle/fluid/inference/tensorrt/convert/shuffle_channel_op.cc index bf1f82076a66c..0fdc262f7e740 100644 --- a/paddle/fluid/inference/tensorrt/convert/shuffle_channel_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/shuffle_channel_op.cc @@ -50,12 +50,6 @@ class ShuffleChannelOpConverter : public OpConverter { int w = input_dims.d[2]; int group = BOOST_GET_CONST(int, op_desc.GetAttr("group")); - if (engine_->with_dynamic_shape()) { - PADDLE_THROW(platform::errors::Fatal( - "You are running the TRT Dynamic Shape mode, " - "the shuffle_channel op does not support dynamic shape yet")); - } - auto* layer = TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *input); nvinfer1::Dims4 reshape_dim(group, c / group, h, w); layer->setReshapeDimensions(reshape_dim); diff --git a/paddle/fluid/inference/tensorrt/convert/skip_layernorm.cc b/paddle/fluid/inference/tensorrt/convert/skip_layernorm.cc index 3db7709acc22d..e621ac0514109 100644 --- a/paddle/fluid/inference/tensorrt/convert/skip_layernorm.cc +++ b/paddle/fluid/inference/tensorrt/convert/skip_layernorm.cc @@ -49,55 +49,60 @@ class SkipLayerNormOpConverter : public OpConverter { auto* scale = get_persistable_data("Scale", &scale_dims); int bias_size = framework::product(bias_dims); int scale_size = framework::product(scale_dims); + bool enable_int8 = op_desc.HasAttr("enable_int8"); nvinfer1::ILayer* layer = nullptr; - if (engine_->with_dynamic_shape()) { - if (engine_->use_oss()) { - auto creator = GetPluginRegistry()->getPluginCreator( - "CustomSkipLayerNormPluginDynamic", "2"); - assert(creator != nullptr); - int type = static_cast((engine_->WithFp16() == 1) - ? 
nvinfer1::DataType::kHALF - : nvinfer1::DataType::kFLOAT); - int ld = input1->getDimensions().d[2]; // hidden dimension - assert(ld > 0); - - const std::vector fields{ - {"type_id", &type, nvinfer1::PluginFieldType::kINT32, 1}, - {"ld", &ld, nvinfer1::PluginFieldType::kINT32, 1}, - {"beta", bias, nvinfer1::PluginFieldType::kFLOAT32, bias_size}, - {"gamma", scale, nvinfer1::PluginFieldType::kFLOAT32, scale_size}, - }; - nvinfer1::PluginFieldCollection* pluginPtr = - static_cast( - malloc(sizeof(*pluginPtr) + - fields.size() * - sizeof(nvinfer1::PluginField))); // remember to free - pluginPtr->nbFields = static_cast(fields.size()); - pluginPtr->fields = fields.data(); - - auto pluginObj = creator->createPlugin( - "CustomSkipLayerNormPluginDynamic", pluginPtr); - auto plugin_layer = engine_->network()->addPluginV2( - inputs.data(), inputs.size(), *pluginObj); - - assert(plugin_layer != nullptr); - layer = plugin_layer; - } else { - float eps = BOOST_GET_CONST(float, op_desc.GetAttr("epsilon")); - bool with_fp16 = - engine_->WithFp16() && !engine_->disable_trt_plugin_fp16(); - plugin::SkipLayerNormPluginDynamic* plugin = - new plugin::SkipLayerNormPluginDynamic(bias, scale, bias_size, - scale_size, eps, with_fp16); - layer = engine_->AddDynamicPlugin(inputs.data(), 2, plugin); + + if (engine_->use_oss()) { + auto creator = GetPluginRegistry()->getPluginCreator( + "CustomSkipLayerNormPluginDynamic", "2"); + PADDLE_ENFORCE_NE( + creator, nullptr, + platform::errors::InvalidArgument( + "fail to get creator of CustomSkipLayerNormPluginDynamic")); + int type = static_cast((engine_->WithFp16() == 1) + ? nvinfer1::DataType::kHALF + : nvinfer1::DataType::kFLOAT); + int ld = input1->getDimensions().d[2]; // hidden dimension + PADDLE_ENFORCE_GT(ld, 0, platform::errors::InvalidArgument( + "in CustomSkipLayerNormPluginDynamic hidden " + "dimension should > 0")); + if (enable_int8) { + type = static_cast(nvinfer1::DataType::kHALF); } + + const std::vector fields{ + {"type_id", &type, nvinfer1::PluginFieldType::kINT32, 1}, + {"ld", &ld, nvinfer1::PluginFieldType::kINT32, 1}, + {"beta", bias, nvinfer1::PluginFieldType::kFLOAT32, bias_size}, + {"gamma", scale, nvinfer1::PluginFieldType::kFLOAT32, scale_size}, + }; + nvinfer1::PluginFieldCollection* pluginPtr = + static_cast( + malloc(sizeof(*pluginPtr) + + fields.size() * + sizeof(nvinfer1::PluginField))); // remember to free + pluginPtr->nbFields = static_cast(fields.size()); + pluginPtr->fields = fields.data(); + + auto pluginObj = + creator->createPlugin("CustomSkipLayerNormPluginDynamic", pluginPtr); + auto plugin_layer = engine_->network()->addPluginV2( + inputs.data(), inputs.size(), *pluginObj); + + PADDLE_ENFORCE_NE( + plugin_layer, nullptr, + platform::errors::InvalidArgument( + "fail to add CustomSkipLayerNormPluginDynamic layer")); + layer = plugin_layer; } else { - PADDLE_THROW(platform::errors::Fatal( - "You are running the Ernie(Bert) model in static" - "shape mode, which is not supported for the time being.\n" - "You can use the config.SetTRTDynamicShapeInfo(...) 
interface" - " to set the shape information to run the dynamic shape mode.")); + float eps = BOOST_GET_CONST(float, op_desc.GetAttr("epsilon")); + bool with_fp16 = + engine_->WithFp16() && !engine_->disable_trt_plugin_fp16(); + plugin::SkipLayerNormPluginDynamic* plugin = + new plugin::SkipLayerNormPluginDynamic(bias, scale, bias_size, + scale_size, eps, with_fp16); + layer = engine_->AddDynamicPlugin(inputs.data(), 2, plugin); } auto output_name = op_desc.Output("Out")[0]; diff --git a/paddle/fluid/inference/tensorrt/convert/slice_op.cc b/paddle/fluid/inference/tensorrt/convert/slice_op.cc index 38521d256419d..2ab024dff327f 100644 --- a/paddle/fluid/inference/tensorrt/convert/slice_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/slice_op.cc @@ -31,6 +31,12 @@ class SliceOpConverter : public OpConverter { // Declare inputs auto* input = engine_->GetITensor(op_desc.Input("Input")[0]); + if (op_desc.HasAttr("out_threshold")) { + float out_scale = + BOOST_GET_CONST(float, op_desc.GetAttr("out_threshold")); + engine_->SetTensorDynamicRange(input, out_scale); + } + std::vector axes = BOOST_GET_CONST(std::vector, op_desc.GetAttr("axes")); std::vector starts = @@ -38,15 +44,6 @@ class SliceOpConverter : public OpConverter { std::vector ends = BOOST_GET_CONST(std::vector, op_desc.GetAttr("ends")); - PADDLE_ENFORCE_EQ( - starts.size(), axes.size(), - platform::errors::InvalidArgument( - "The size of starts must be equal to the size of axes.")); - PADDLE_ENFORCE_EQ( - ends.size(), axes.size(), - platform::errors::InvalidArgument( - "The size of ends must be equal to the size of axes.")); - auto input_dims = input->getDimensions(); if (!engine_->with_dynamic_shape()) { // notice that input shape is [CHW] without batch axis when input has @@ -56,10 +53,6 @@ class SliceOpConverter : public OpConverter { } input_dims.d[0] = 1; // fake batchsize, not useful here for (size_t i = 0; i < axes.size(); i++) { - // split on batch is not supported in TensorRT - PADDLE_ENFORCE_NE(axes[i], 0, platform::errors::InvalidArgument( - "Invalid slice axis. Slice on batch " - "axis is not supported in TensorRT")); if (starts[i] < 0) { starts[i] = std::max(starts[i] + input_dims.d[axes[i]], 0); } diff --git a/paddle/fluid/inference/tensorrt/convert/split_op.cc b/paddle/fluid/inference/tensorrt/convert/split_op.cc index 75b317e7bfd90..47a6dd783a70c 100644 --- a/paddle/fluid/inference/tensorrt/convert/split_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/split_op.cc @@ -33,17 +33,7 @@ class SplitOpConverter : public OpConverter { size_t output_num = op_desc.Output("Out").size(); // Get Attrs - PADDLE_ENFORCE_EQ(input_num, 1UL, - platform::errors::InvalidArgument( - "Invalid input X's size of split TRT converter. " - "Expected 1, received %d.", - input_num)); int axis = BOOST_GET_CONST(int, op_desc.GetAttr("axis")); - // split on batch is not supported in TensorRT - PADDLE_ENFORCE_NE( - axis, 0, - platform::errors::InvalidArgument( - "Invalid split axis. 
Split on batch is not supported in TensorRT")); std::vector output_lengths = BOOST_GET_CONST(std::vector, op_desc.GetAttr("sections")); diff --git a/paddle/fluid/inference/tensorrt/convert/stack_op.cc b/paddle/fluid/inference/tensorrt/convert/stack_op.cc index d538c58879d78..6105e10799e55 100644 --- a/paddle/fluid/inference/tensorrt/convert/stack_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/stack_op.cc @@ -45,6 +45,11 @@ class StackOpConverter : public OpConverter { for (int i = 0; i < input_num; ++i) { inputs[i] = engine_->GetITensor(input[i]); + if (op_desc.HasAttr("out_threshold")) { + float out_scale = + BOOST_GET_CONST(float, op_desc.GetAttr("out_threshold")); + engine_->SetTensorDynamicRange(inputs[i], out_scale); + } } int axis = BOOST_GET_CONST(int, op_desc.GetAttr("axis")); @@ -53,26 +58,19 @@ class StackOpConverter : public OpConverter { } nvinfer1::ILayer* layer = nullptr; - if (engine_->with_dynamic_shape()) { #if IS_TRT_VERSION_GE(6000) - bool with_fp16 = - engine_->WithFp16() && !engine_->disable_trt_plugin_fp16(); - plugin::StackPluginDynamic* plugin = - new plugin::StackPluginDynamic(axis, input_num, with_fp16); - layer = engine_->AddDynamicPlugin(inputs, input_num, plugin); - assert(layer != nullptr); + bool with_fp16 = engine_->WithFp16() && !engine_->disable_trt_plugin_fp16(); + plugin::StackPluginDynamic* plugin = + new plugin::StackPluginDynamic(axis, input_num, with_fp16); + layer = engine_->AddDynamicPlugin(inputs, input_num, plugin); + PADDLE_ENFORCE_NOT_NULL( + layer, platform::errors::InvalidArgument( + "trt stack layer in converter could not be created.")); #else - PADDLE_THROW(platform::errors::Fatal( - "You are running the TRT Dynamic Shape mode, need to confirm that " - "your TRT version is no less than 6.0")); + PADDLE_THROW(platform::errors::Fatal( + "You are running the TRT Dynamic Shape mode, need to confirm that " + "your TRT version is no less than 6.0")); #endif - } else { - PADDLE_THROW(platform::errors::Fatal( - "You are running the Ernie(Bert) model in static" - "shape mode, which is not supported for the time being.\n" - "You can use the config.SetTRTDynamicShapeInfo(...) interface" - " to set the shape information to run the dynamic shape mode.")); - } auto output_name = op_desc.Output("Y").front(); RreplenishLayerAndOutput(layer, "stack", {output_name}, test_mode); free(inputs); diff --git a/paddle/fluid/inference/tensorrt/helper.h b/paddle/fluid/inference/tensorrt/helper.h index 971f99e691972..6158fd130bad8 100644 --- a/paddle/fluid/inference/tensorrt/helper.h +++ b/paddle/fluid/inference/tensorrt/helper.h @@ -60,6 +60,9 @@ static nvinfer1::IRuntime* createInferRuntime(nvinfer1::ILogger* logger) { static nvinfer1::IPluginRegistry* GetPluginRegistry() { return static_cast(dy::getPluginRegistry()); } +static int GetInferLibVersion() { + return static_cast(dy::getInferLibVersion()); +} #endif // A logger for create TensorRT infer builder. 
@@ -67,9 +70,12 @@ class NaiveLogger : public nvinfer1::ILogger { public: void log(nvinfer1::ILogger::Severity severity, const char* msg) override { switch (severity) { - case Severity::kINFO: + case Severity::kVERBOSE: VLOG(3) << msg; break; + case Severity::kINFO: + VLOG(2) << msg; + break; case Severity::kWARNING: LOG(WARNING) << msg; break; diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc index b681b098c8c76..c8dfc169535da 100644 --- a/paddle/fluid/inference/tensorrt/op_teller.cc +++ b/paddle/fluid/inference/tensorrt/op_teller.cc @@ -42,6 +42,10 @@ struct SimpleOpTypeSetTeller : public Teller { teller_set.insert("multihead_matmul"); teller_set.insert("skip_layernorm"); teller_set.insert("slice"); + int8_teller_set.insert("fused_embedding_eltwise_layernorm"); + int8_teller_set.insert("multihead_matmul"); + int8_teller_set.insert("skip_layernorm"); + int8_teller_set.insert("slice"); #endif #if IS_TRT_VERSION_GE(7130) teller_set.insert("group_norm"); @@ -61,6 +65,8 @@ struct SimpleOpTypeSetTeller : public Teller { // use this set for no calib int8. std::unordered_set int8_teller_set{"mul", "conv2d", + "matmul", + "stack", "conv2d_fusion", "pool2d", "relu", @@ -114,7 +120,6 @@ struct SimpleOpTypeSetTeller : public Teller { "yolo_box", "roi_align", "affine_channel", - "multiclass_nms", "nearest_interp", "anchor_generator", }; @@ -132,13 +137,93 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, return false; for (auto& teller : tellers_) { - if (op_type == "pool2d" || op_type == "conv2d" || - op_type == "depthwise_conv2d" || op_type == "conv2d_transpose") { + if (op_type == "depthwise_conv2d") { std::vector paddings = BOOST_GET_CONST(std::vector, desc.GetAttr("paddings")); if (paddings.size() > 2) return false; } + + if (op_type == "pool2d") { + std::vector paddings = + BOOST_GET_CONST(std::vector, desc.GetAttr("paddings")); + if (paddings.size() > 2) return false; + if (desc.Input("X").size() != 1) { + VLOG(3) << "TRT Pool2d expect 1 input, but got " + << desc.Input("X").size(); + return false; + } + if (desc.Output("Out").size() != 1) { + VLOG(3) << "TRT Pool2d has only 1 output, but got " + << desc.Output("Out").size(); + return false; + } + if (!desc.HasAttr("pooling_type")) { + return false; + } else { + std::string pool_type = + BOOST_GET_CONST(std::string, desc.GetAttr("pooling_type")); + if (pool_type != "max" && pool_type != "avg") { + VLOG(3) << "Wrong pool op type, the trt do not support the " + << pool_type << " pool type."; + return false; + } + } + } + + if (op_type == "conv2d" || op_type == "conv2d_transpose" || + op_type == "conv2d_fusion") { + std::vector paddings = + BOOST_GET_CONST(std::vector, desc.GetAttr("paddings")); + + // conv2d and conv2d_transpose need padding check + if (paddings.size() > 2 && op_type != "conv2d_fusion") return false; + + if (desc.Input("Input").size() != 1) { + VLOG(3) << "TRT Conv2d expect 1 input, but got " + << desc.Input("Input").size() << " input."; + return false; + } + + if (desc.Input("Filter").size() != 1) { + VLOG(3) << "TRT Conv2d expect 1 filter, but got " + << desc.Input("Filter").size() << " filter."; + return false; + } + + if (desc.HasAttr("enable_int8")) { + if (op_type == "conv2d" || op_type == "conv2d_fusion") { + if (!desc.HasAttr("Input_scale")) { + VLOG(3) << "Input scale not found. 
TRT int8" + " requires conv/deconv to have " + "input quantization scales."; + return false; + } + } + } + + if (op_type == "conv2d_transpose") { + if (!desc.HasAttr("dilations")) { + return false; + } else { + const std::vector dilations = + BOOST_GET_CONST(std::vector, desc.GetAttr("dilations")); + if (dilations[0] != 1 || dilations[1] != 1) { + VLOG(3) << "In conv2d_transpose, Dilations must be (1, 1) for " + "tensorRT, but given (" + << dilations[0] << ", " << dilations[1] << ")"; + return false; + } + } + } + + if (desc.Output("Output").size() != 1) { + VLOG(3) << "TRT Conv2d expect 1 output, but got " + << desc.Output("Output").size() << " output."; + return false; + } + } + if (op_type == "matmul") { auto* block = desc.Block(); for (auto& param_name : desc.Inputs()) { @@ -146,7 +231,7 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, auto* var_desc = block->FindVar(var_name); const auto shape = var_desc->GetShape(); if (shape.size() < 3) { - VLOG(1) + VLOG(3) << "matmul op dims < 3 not supported in tensorrt, but got dims " << shape.size() << ", so jump it."; return false; @@ -184,7 +269,18 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, if (axis.size() >= nvinfer1::Dims::MAX_DIMS) return false; } } - if (op_type == "flatten2" || op_type == "flatten") { + if (op_type == "flatten2") { + // flatten doesn't support dynamic shape currently + if (!desc.HasAttr("axis")) { + return false; + } else { + if (with_dynamic_shape) return false; + int axis = BOOST_GET_CONST(int, desc.GetAttr("axis")); + if (axis != 1) return false; + } + } + + if (op_type == "flatten") { // flatten doesn't support dynamic shape currently if (!desc.HasAttr("axis")) { return false; @@ -224,7 +320,7 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, auto* var_desc = block->FindVar(var_name); const auto shape = var_desc->GetShape(); if (shape.size() != 3) { - VLOG(1) << "multiclass_nms op dims != 3 not supported in tensorrt, " + VLOG(3) << "multiclass_nms op dims != 3 not supported in tensorrt, " "but got dims " << shape.size() << ", so jump it."; return false; @@ -247,18 +343,6 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, if (registry == nullptr) return false; } - if (op_type == "fc" || op_type == "mul") { - const int x_num_col_dims = - desc.HasAttr("x_num_col_dims") - ? BOOST_GET_CONST(int, desc.GetAttr("x_num_col_dims")) - : (desc.HasAttr("in_num_col_dims") - ? 
BOOST_GET_CONST(int, desc.GetAttr("in_num_col_dims")) - : 1); - if (x_num_col_dims != 1 && x_num_col_dims != 2) { - return false; - } - } - if (op_type == "nearest_interp") { std::vector attrs{"data_layout", "interp_method", "align_corners", "scale", @@ -274,6 +358,25 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, auto interp_method = BOOST_GET_CONST(std::string, desc.GetAttr("interp_method")); if (interp_method != "nearest") return false; + + if (!desc.HasAttr("scale") || !desc.HasAttr("out_h") || + !desc.HasAttr("out_w")) { + return false; + } else { + auto scale = BOOST_GET_CONST(float, desc.GetAttr("scale")); + auto out_h = BOOST_GET_CONST(int, desc.GetAttr("out_h")); + auto out_w = BOOST_GET_CONST(int, desc.GetAttr("out_w")); + if (!(scale > 0.f && (out_h <= 0 && out_w <= 0))) { + if (out_h <= 0) { + VLOG(3) << "out_h must be greater than 0 if scale is not set."; + return false; + } + if (out_w <= 0) { + VLOG(3) << "out_w must be greater than 0 if scale is not set."; + return false; + } + } + } } if (op_type == "roi_align") { @@ -298,6 +401,235 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, if (spatial_scale <= 0.f) return false; } + if (op_type == "hard_swish") { + if (desc.Input("X").size() != 1) { + VLOG(3) << "HardSwish op has only 1 input, but got " + << desc.Input("X").size(); + return false; + } + + if (desc.Output("Out").size() != 1) { + VLOG(3) << "HardSwish op has only 1 output, but got " + << desc.Output("Out").size(); + return false; + } + } + + if (op_type == "batch_norm") { + const std::vector bn_inputs = {"X", "Bias", "Mean", "Scale", + "Variance"}; + for (unsigned int i = 0; i < bn_inputs.size(); i++) { + if (desc.Input(bn_inputs[i]).size() != 1) { + VLOG(3) << "Invalid " << bn_inputs[i] + << "'s size of batch_norm TRT " + "converter. Expected 1, received " + << desc.Input(bn_inputs[i]).size() << "."; + return false; + } + } + + if (desc.Output("Y").size() != 1) { + VLOG(3) << "Invalid output Y's size of batch_norm TRT " + "converter. Expected 1, received " + << desc.Output("Y").size() << "."; + return false; + } + } + + if (op_type == "split") { + if (desc.Input("X").size() != 1) { + VLOG(3) << "Invalid input X's size of split TRT converter. " + "Expected 1, received " + << desc.Input("X").size() << "."; + return false; + } + if (!desc.HasAttr("axis")) { + return false; + } else { + int axis = BOOST_GET_CONST(int, desc.GetAttr("axis")); + if (axis == 0) { + VLOG(3) << "Invalid split axis. Split on batch is not supported in " + "TensorRT"; + return false; + } + } + } + + if (op_type == "slice") { + if (!desc.HasAttr("axes") || !desc.HasAttr("starts") || + !desc.HasAttr("ends")) { + return false; + } else { + std::vector axes = + BOOST_GET_CONST(std::vector, desc.GetAttr("axes")); + std::vector starts = + BOOST_GET_CONST(std::vector, desc.GetAttr("starts")); + std::vector ends = + BOOST_GET_CONST(std::vector, desc.GetAttr("ends")); + if (axes.size() != starts.size() || axes.size() != ends.size()) { + return false; + } + if (!with_dynamic_shape) { + for (size_t i = 0; i < axes.size(); i++) { + if (axes[i] == 0) { + VLOG(3) << "Invalid slice axis. 
Slice on batch axis is not " "supported in TensorRT"; return false; } } } } + + if (op_type == "elementwise_add" || op_type == "elementwise_mul") { + if (desc.Input("X").size() != 1) { + VLOG(3) << "The input op's Input(\"X\").size() " + "should equal to 1, but received Input(\"X\").size() = " + << desc.Input("X").size() << "."; + return false; + } + if (desc.Input("Y").size() != 1) { + VLOG(3) << "The input op's Input(\"Y\").size() " + "should equal to 1, but received Input(\"Y\").size() = " + << desc.Input("Y").size() << "."; + return false; + } + if (desc.Output("Out").size() != 1) { + VLOG(3) << "The input op's Output(\"Out\").size() " + "should equal to 1, but received Output(\"Out\").size() = " + << desc.Output("Out").size() << "."; + return false; + } + } + + if (op_type == "stack") { + if (!with_dynamic_shape) { + VLOG(3) + << "static shape mode is not supported for TRT stack.\n" + "You can use the config.SetTRTDynamicShapeInfo(...) interface" + " to set the shape information to run the dynamic shape " + "mode."; + return false; + } + } + + if (op_type == "fused_embedding_eltwise_layernorm") { + if (!with_dynamic_shape) { + VLOG(3) << "fused_embedding_eltwise_layernorm should run on dynamic " + "shape mode."; + return false; + } + if (desc.Input("Ids").size() != desc.Input("Embs").size()) { + VLOG(3) << "The id and emb size of fused EmbEltwiseLayerNormOp " + "should be same "; + return false; + } + } + + if (op_type == "gelu") { + if (desc.Input("X").size() != 1) { + VLOG(3) << "gelu op has only 1 input, but got " + << desc.Input("X").size(); + return false; + } + if (desc.Output("Out").size() != 1) { + VLOG(3) << "gelu op has only 1 output, but got " + << desc.Output("Out").size(); + return false; + } + } + + if (op_type == "layer_norm") { + if (desc.Input("X").size() != 1) { + VLOG(3) << "input of layer_norm op converter should be 1, got " + << desc.Input("X").size(); + return false; + } + if (desc.Input("Bias").size() != 1) { + VLOG(3) << "Bias of layer_norm op converter should be 1, got " + << desc.Input("Bias").size(); + return false; + } + if (desc.Input("Scale").size() != 1) { + VLOG(3) << "Scale of layer_norm op converter should be 1, got " + << desc.Input("Scale").size(); + return false; + } + if (desc.Output("Y").size() != 1) { + VLOG(3) << "output of layer_norm op converter should be 1, got " + << desc.Output("Y").size(); + return false; + } + } + + if (op_type == "leaky_relu") { + if (desc.Input("X").size() != 1) { + VLOG(3) << "Invalid number of TRT leaky_relu op converter " + "inputs. Expected 1, but received " + << desc.Input("X").size(); + return false; + } + if (desc.Output("Out").size() != 1) { + VLOG(3) << "output of leaky_relu op converter should be 1, got " + << desc.Output("Out").size(); + return false; + } + } + + if (op_type == "pad") { + const float pad_value = BOOST_GET_CONST(float, desc.GetAttr("pad_value")); + if (pad_value != 0.0f) { + VLOG(3) << "The pad layer of TRT only supports zero."; + return false; + } + } + + if (op_type == "prelu") { + if (desc.Input("X").size() != 1) { + VLOG(3) << "Invalid input X's size of prelu TRT converter. " + "Expected 1, received " + << desc.Input("X").size() << "."; + return false; + } + if (desc.Output("Out").size() != 1) { + VLOG(3) << "Invalid output Out's size of prelu TRT converter. 
" + "Expected 1, received " + << desc.Output("Out").size() << "."; + return false; + } + } + + if (op_type == "roi_align") { + if (!with_dynamic_shape) { + VLOG(3) << "TRT roi align plugin only accept the dynamic shape, " + "because that " + "the roi_align will change the batch size."; + return false; + } + } + + if (op_type == "shuffle_channel") { + if (with_dynamic_shape) { + VLOG(3) << "You are running the TRT Dynamic Shape mode, " + "the shuffle_channel op does not support dynamic shape yet"; + return false; + } + } + + if (op_type == "skip_layernorm") { + if (!with_dynamic_shape) { + VLOG(3) << "the skip_layernorm does not support static shape yet"; + return false; + } + } + + if (op_type == "multihead_matmul") { + if (!with_dynamic_shape) { + VLOG(3) << "the multihead_matmul does not support static shape yet"; + return false; + } + } + if ((*teller)(op_type, desc, use_no_calib_int8)) return true; } return false; diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt index 75628adbe8a85..f74cd671d6dca 100644 --- a/paddle/fluid/inference/tests/api/CMakeLists.txt +++ b/paddle/fluid/inference/tests/api/CMakeLists.txt @@ -522,10 +522,10 @@ if(WITH_GPU AND TENSORRT_FOUND) inference_analysis_test(trt_instance_norm_test SRCS trt_instance_norm_converter_test.cc EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} ARGS --infer_model=${TEST_INSTANCE_NORM_MODEL}/) - inference_analysis_test(test_analyzer_capi_gpu SRCS analyzer_capi_gpu_tester.cc + inference_analysis_test(test_analyzer_capi_exp_gpu SRCS analyzer_capi_exp_gpu_tester.cc EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} paddle_inference_c ARGS --infer_model=${TRT_MODEL_INSTALL_DIR}/trt_inference_test_models) - inference_analysis_test(test_analyzer_capi_xpu SRCS analyzer_capi_xpu_tester.cc + inference_analysis_test(test_analyzer_capi_exp_xpu SRCS analyzer_capi_exp_xpu_tester.cc EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} paddle_inference_c ARGS --infer_model=${TRT_MODEL_INSTALL_DIR}/trt_inference_test_models) @@ -604,14 +604,23 @@ inference_analysis_test(lite_resnet50_test SRCS lite_resnet50_test.cc EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} ARGS --infer_model=${RESNET50_MODEL_DIR}) -inference_analysis_test(test_analyzer_capi SRCS analyzer_capi_tester.cc - EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} paddle_inference_c - ARGS --infer_model=${RESNET50_MODEL_DIR}/model) +inference_analysis_test(test_analyzer_capi_exp SRCS analyzer_capi_exp_tester.cc + EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} paddle_inference_c + ARGS --infer_model=${RESNET50_MODEL_DIR}/model) + +inference_analysis_test(test_analyzer_capi_exp_pd_config SRCS analyzer_capi_exp_pd_config_tester.cc + EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} paddle_inference_c + ARGS --infer_model=${MOBILENET_INSTALL_DIR}/model) + +inference_analysis_test(test_analyzer_capi_exp_pd_tensor SRCS analyzer_capi_exp_pd_tensor_tester.cc + EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} paddle_inference_c + ARGS --infer_model=${MOBILENET_INSTALL_DIR}/model) -inference_analysis_test(test_analyzer_capi_pd_tensor SRCS analyzer_capi_pd_tensor_tester.cc +if (NOT APPLE AND NOT WIN32) + inference_analysis_test(test_analyzer_capi_exp_pd_threads SRCS analyzer_capi_exp_pd_threads_tester.cc EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} paddle_inference_c ARGS --infer_model=${MOBILENET_INSTALL_DIR}/model) - +endif() inference_analysis_test(test_analyzer_zerocopytensor_tensor SRCS analyzer_zerocopy_tensor_tester.cc EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} ARGS --infer_model=${OCR_INSTALL_DIR}/model) @@ -621,17 +630,17 @@ 
inference_analysis_test(test_analyzer_paddletensor_tensor SRCS analyzer_paddle_t ARGS --infer_model=${OCR_INSTALL_DIR}/model --infer_data=${OCR_INSTALL_DIR}/data.txt --refer_result=${OCR_INSTALL_DIR}/result.txt) if(WITH_MKLDNN) - inference_analysis_test(test_analyzer_capi_int SRCS analyzer_capi_int_tester.cc + inference_analysis_test(test_analyzer_capi_exp_int SRCS analyzer_capi_exp_int_tester.cc EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} paddle_inference_c ARGS --infer_model=${INT8_DATA_DIR}/resnet50/model) - endif() +endif() -inference_analysis_test(test_analyzer_capi_ner SRCS analyzer_capi_ner_tester.cc +inference_analysis_test(test_analyzer_capi_exp_ner SRCS analyzer_capi_exp_ner_tester.cc EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} paddle_inference_c ARGS --infer_model=${CHINESE_NER_INSTALL_DIR}/model) if(WITH_GPU) - inference_analysis_test(paddle_infer_api_test SRCS paddle_infer_api_test.cc + inference_analysis_test(paddle_infer_api_test SRCS paddle_infer_api_test.cc EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} ARGS --infer_model=${RESNET50_MODEL_DIR}) endif() diff --git a/paddle/fluid/inference/tests/api/analyzer_capi_exp_gpu_tester.cc b/paddle/fluid/inference/tests/api/analyzer_capi_exp_gpu_tester.cc new file mode 100644 index 0000000000000..de9e2afd705f9 --- /dev/null +++ b/paddle/fluid/inference/tests/api/analyzer_capi_exp_gpu_tester.cc @@ -0,0 +1,160 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include +#include +#include +#include +#include "paddle/fluid/inference/capi_exp/pd_inference_api.h" +#include "paddle/fluid/inference/tests/api/tester_helper.h" + +namespace paddle { +namespace inference { +namespace analysis { + +TEST(PD_Config, gpu_interface) { + std::string model_dir = FLAGS_infer_model + "/mobilenet"; + std::string prog_file = model_dir + "/__model__"; + std::string param_file = model_dir + "/__params__"; + std::string opt_cache_dir = FLAGS_infer_model + "/OptimCacheDir"; + const char* ops_name = "conv_2d"; + + PD_Config* config = PD_ConfigCreate(); + PD_ConfigSetModel(config, prog_file.c_str(), param_file.c_str()); + PD_ConfigSetOptimCacheDir(config, opt_cache_dir.c_str()); + + PD_ConfigEnableUseGpu(config, 100, 0); + bool use_gpu = PD_ConfigUseGpu(config); + EXPECT_TRUE(use_gpu); + int init_size = PD_ConfigMemoryPoolInitSizeMb(config); + EXPECT_EQ(init_size, 100); + int gpu_device_id = PD_ConfigGpuDeviceId(config); + EXPECT_EQ(gpu_device_id, 0); + float frac = PD_ConfigFractionOfGpuMemoryForPool(config); + LOG(INFO) << frac; + PD_ConfigEnableCudnn(config); + bool cudnn = PD_ConfigCudnnEnabled(config); + EXPECT_TRUE(cudnn); + + PD_ConfigEnableTensorRtEngine(config, 1 << 20, 1, 3, PD_PRECISION_INT8, FALSE, + TRUE); + bool trt_enable = PD_ConfigTensorRtEngineEnabled(config); + EXPECT_TRUE(trt_enable); + + const char* tensor_name = "image"; + size_t shapes_num[1] = {4}; + int32_t min_shape[4] = {1, 3, 36, 36}; + int32_t max_shape[4] = {1, 3, 224, 224}; + int32_t opt_shape[4] = {1, 3, 224, 224}; + int32_t* min_shape_ptr = min_shape; + int32_t* max_shape_ptr = max_shape; + int32_t* opt_shape_ptr = opt_shape; + PD_ConfigSetTrtDynamicShapeInfo(config, 1, &tensor_name, shapes_num, + &min_shape_ptr, &max_shape_ptr, + &opt_shape_ptr, FALSE); + PD_ConfigDisableTensorRtOPs(config, 1, &ops_name); + PD_ConfigEnableTensorRtOSS(config); + bool oss_enabled = PD_ConfigTensorRtOssEnabled(config); + EXPECT_TRUE(oss_enabled); + + PD_ConfigEnableTensorRtDla(config, 4); + bool dla_enabled = PD_ConfigTensorRtDlaEnabled(config); + EXPECT_TRUE(dla_enabled); + + PD_ConfigEnableGpuMultiStream(config); + bool thread_local_thread = PD_ConfigThreadLocalStreamEnabled(config); + EXPECT_TRUE(thread_local_thread); + + PD_ConfigDisableGpu(config); + PD_ConfigDestroy(config); +} + +TEST(PD_Config, use_gpu) { + std::string model_dir = FLAGS_infer_model + "/mobilenet"; + PD_Config* config = PD_ConfigCreate(); + + PD_ConfigDisableGpu(config); + PD_ConfigSetCpuMathLibraryNumThreads(config, 10); + int num_thread = PD_ConfigGetCpuMathLibraryNumThreads(config); + EXPECT_EQ(num_thread, 10); + + PD_ConfigSwitchIrDebug(config, TRUE); + PD_ConfigSetModelDir(config, model_dir.c_str()); + PD_ConfigSetOptimCacheDir(config, + (FLAGS_infer_model + "/OptimCacheDir").c_str()); + const char* model_dir_ = PD_ConfigGetModelDir(config); + LOG(INFO) << model_dir_; + + PD_ConfigEnableUseGpu(config, 100, 0); + bool use_gpu = PD_ConfigUseGpu(config); + EXPECT_TRUE(use_gpu); + int device_id = PD_ConfigGpuDeviceId(config); + EXPECT_EQ(device_id, 0); + int init_size = PD_ConfigMemoryPoolInitSizeMb(config); + EXPECT_EQ(init_size, 100); + + float frac = PD_ConfigFractionOfGpuMemoryForPool(config); + LOG(INFO) << frac; + + PD_ConfigEnableCudnn(config); + bool cudnn = PD_ConfigCudnnEnabled(config); + EXPECT_TRUE(cudnn); + + PD_ConfigSwitchIrOptim(config, TRUE); + bool ir_optim = PD_ConfigIrOptim(config); + EXPECT_TRUE(ir_optim); + + PD_ConfigEnableTensorRtEngine(config, 1 << 20, 1, 3, PD_PRECISION_FLOAT32, + FALSE, 
FALSE); + bool trt_enable = PD_ConfigTensorRtEngineEnabled(config); + EXPECT_TRUE(trt_enable); + PD_ConfigEnableMemoryOptim(config); + bool memory_optim_enable = PD_ConfigMemoryOptimEnabled(config); + EXPECT_TRUE(memory_optim_enable); + PD_ConfigEnableProfile(config); + bool profiler_enable = PD_ConfigProfileEnabled(config); + EXPECT_TRUE(profiler_enable); + PD_ConfigSetInvalid(config); + bool is_valid = PD_ConfigIsValid(config); + EXPECT_FALSE(is_valid); + PD_ConfigDestroy(config); +} + +TEST(PD_Config, trt_int8) { + std::string model_dir = FLAGS_infer_model + "/mobilenet"; + PD_Config* config = PD_ConfigCreate(); + PD_ConfigEnableUseGpu(config, 100, 0); + PD_ConfigEnableTensorRtEngine(config, 1 << 20, 1, 3, PD_PRECISION_INT8, FALSE, + TRUE); + bool trt_enable = PD_ConfigTensorRtEngineEnabled(config); + EXPECT_TRUE(trt_enable); + PD_ConfigDestroy(config); +} + +TEST(PD_Config, trt_fp16) { + std::string model_dir = FLAGS_infer_model + "/mobilenet"; + PD_Config* config = PD_ConfigCreate(); + PD_ConfigEnableUseGpu(config, 100, 0); + PD_ConfigEnableTensorRtEngine(config, 1 << 20, 1, 3, PD_PRECISION_HALF, FALSE, + FALSE); + bool trt_enable = PD_ConfigTensorRtEngineEnabled(config); + EXPECT_TRUE(trt_enable); + PD_Predictor* predictor = PD_PredictorCreate(config); + PD_PredictorDestroy(predictor); +} + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tests/api/analyzer_capi_exp_int_tester.cc b/paddle/fluid/inference/tests/api/analyzer_capi_exp_int_tester.cc new file mode 100644 index 0000000000000..d3a15cb285772 --- /dev/null +++ b/paddle/fluid/inference/tests/api/analyzer_capi_exp_int_tester.cc @@ -0,0 +1,89 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
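The GPU tester above exercises nearly every TensorRT-related config call at once; as a compact reference, a minimal sketch of setting up an FP16 TensorRT engine with one dynamic-shape input follows. The model paths, the tensor name "image", and the shape values are placeholders, not taken from this patch.

#include "paddle/fluid/inference/capi_exp/pd_inference_api.h"

// Illustrative only; paths and tensor name are assumptions.
void BuildTrtFp16Predictor() {
  PD_Config* config = PD_ConfigCreate();
  PD_ConfigSetModel(config, "./mobilenet/__model__", "./mobilenet/__params__");
  PD_ConfigEnableUseGpu(config, /*memory_pool_init_size_mb=*/100, /*gpu_id=*/0);
  // Arguments mirror the trt_fp16 test above: workspace size, max batch,
  // min subgraph size, precision, use_static, use_calib_mode (names assumed).
  PD_ConfigEnableTensorRtEngine(config, 1 << 20, 1, 3, PD_PRECISION_HALF, FALSE,
                                FALSE);

  const char* tensor_name = "image";
  size_t shapes_num[1] = {4};
  int32_t min_shape[4] = {1, 3, 112, 112};
  int32_t max_shape[4] = {1, 3, 224, 224};
  int32_t opt_shape[4] = {1, 3, 224, 224};
  int32_t* min_ptr = min_shape;
  int32_t* max_ptr = max_shape;
  int32_t* opt_ptr = opt_shape;
  PD_ConfigSetTrtDynamicShapeInfo(config, 1, &tensor_name, shapes_num, &min_ptr,
                                  &max_ptr, &opt_ptr, FALSE);

  // As in the testers above, the config is not destroyed once a predictor has
  // been created from it.
  PD_Predictor* predictor = PD_PredictorCreate(config);
  PD_PredictorDestroy(predictor);
}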
*/ + +#include +#include +#include +#include +#include +#include "paddle/fluid/inference/capi_exp/pd_inference_api.h" +#include "paddle/fluid/inference/tests/api/tester_helper.h" + +namespace paddle { +namespace inference { +namespace analysis { + +void predictor_run() { + std::string model_dir = FLAGS_infer_model; + PD_Config* config = PD_ConfigCreate(); + PD_ConfigDisableGpu(config); + PD_ConfigSetCpuMathLibraryNumThreads(config, 10); + PD_ConfigSwitchIrDebug(config, TRUE); + PD_ConfigSetModelDir(config, model_dir.c_str()); + PD_Predictor* predictor = PD_PredictorCreate(config); + PD_OneDimArrayCstr* input_names = PD_PredictorGetInputNames(predictor); + LOG(INFO) << "The inputs' size is: " << input_names->size; + EXPECT_EQ(input_names->size, 2u); + + int32_t shape_0[4] = {1, 3, 224, 224}; + float data_0[1 * 3 * 224 * 224] = {0}; + PD_Tensor* input_0 = PD_PredictorGetInputHandle(predictor, "image"); + PD_TensorReshape(input_0, 4, shape_0); + PD_TensorCopyFromCpuFloat(input_0, data_0); + int32_t shape_1[2] = {1, 1}; + int64_t data_1[1] = {0}; + PD_Tensor* input_1 = PD_PredictorGetInputHandle(predictor, "label"); + PD_TensorReshape(input_1, 2, shape_1); + PD_TensorCopyFromCpuInt64(input_1, data_1); + + LOG(INFO) << "Run Inference in CAPI encapsulation. "; + EXPECT_TRUE(PD_PredictorRun(predictor)); + + PD_OneDimArrayCstr* output_names = PD_PredictorGetOutputNames(predictor); + LOG(INFO) << "output size is: " << output_names->size; + for (size_t index = 0; index < output_names->size; ++index) { + LOG(INFO) << "output[" << index + << "]'s name is: " << output_names->data[index]; + PD_Tensor* output = + PD_PredictorGetOutputHandle(predictor, output_names->data[index]); + PD_OneDimArrayInt32* shape = PD_TensorGetShape(output); + LOG(INFO) << "output[" << index << "]'s shape_size is: " << shape->size; + int32_t out_size = 1; + for (size_t i = 0; i < shape->size; ++i) { + LOG(INFO) << "output[" << index << "]'s shape is: " << shape->data[i]; + out_size = out_size * shape->data[i]; + } + float* out_data = new float[out_size]; + PD_TensorCopyToCpuFloat(output, out_data); + LOG(INFO) << "output[" << index << "]'s DATA is: " << out_data[0]; + delete[] out_data; + PD_OneDimArrayInt32Destroy(shape); + PD_TensorDestroy(output); + } + PD_PredictorClearIntermediateTensor(predictor); + PD_PredictorTryShrinkMemory(predictor); + PD_OneDimArrayCstrDestroy(output_names); + PD_TensorDestroy(input_1); + PD_TensorDestroy(input_0); + PD_OneDimArrayCstrDestroy(input_names); + PD_PredictorDestroy(predictor); +} + +#ifdef PADDLE_WITH_MKLDNN +TEST(PD_PredictorRun, predictor_run) { predictor_run(); } +#endif + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tests/api/analyzer_capi_exp_ner_tester.cc b/paddle/fluid/inference/tests/api/analyzer_capi_exp_ner_tester.cc new file mode 100644 index 0000000000000..4369cd78dfa37 --- /dev/null +++ b/paddle/fluid/inference/tests/api/analyzer_capi_exp_ner_tester.cc @@ -0,0 +1,105 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
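Every tester above pairs each PD_*Create/Get call with an explicit PD_*Destroy. In C++ callers this bookkeeping can be delegated to std::unique_ptr with custom deleters; the sketch below is illustrative only (the alias names, the model path, and the input name "image" are assumptions), using the same destroy functions the testers invoke by hand.

#include <memory>
#include <string>
#include "paddle/fluid/inference/capi_exp/pd_inference_api.h"

using ConfigPtr = std::unique_ptr<PD_Config, decltype(&PD_ConfigDestroy)>;
using PredictorPtr =
    std::unique_ptr<PD_Predictor, decltype(&PD_PredictorDestroy)>;
using TensorPtr = std::unique_ptr<PD_Tensor, decltype(&PD_TensorDestroy)>;

void RunOnce(const std::string& model_dir) {
  ConfigPtr config(PD_ConfigCreate(), PD_ConfigDestroy);
  PD_ConfigDisableGpu(config.get());
  PD_ConfigSetModelDir(config.get(), model_dir.c_str());
  // The config appears to be consumed by PD_PredictorCreate (the testers never
  // destroy it afterwards), hence the release().
  PredictorPtr predictor(PD_PredictorCreate(config.release()),
                         PD_PredictorDestroy);
  TensorPtr input(PD_PredictorGetInputHandle(predictor.get(), "image"),
                  PD_TensorDestroy);
  // ... reshape, copy data, PD_PredictorRun(predictor.get()), read outputs ...
}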
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include +#include +#include "paddle/fluid/inference/capi_exp/pd_inference_api.h" +#include "paddle/fluid/inference/tests/api/tester_helper.h" + +namespace paddle { +namespace inference { +namespace analysis { + +TEST(PD_PredictorRun, predictor_run) { + auto model_dir = FLAGS_infer_model; + PD_Config *config = PD_ConfigCreate(); + PD_ConfigSetModel(config, (model_dir + "/__model__").c_str(), + (model_dir + "/param").c_str()); + PD_ConfigDisableGpu(config); + + PD_Predictor *predictor = PD_PredictorCreate(config); + size_t input_num = PD_PredictorGetInputNum(predictor); + LOG(INFO) << "Input num: " << input_num; + size_t output_num = PD_PredictorGetOutputNum(predictor); + LOG(INFO) << "Output num: " << output_num; + + PD_OneDimArrayCstr *input_names = PD_PredictorGetInputNames(predictor); + EXPECT_EQ(input_names->size, 2u); + LOG(INFO) << "Predictor start run!"; + PD_Tensor *inputs[2]; + inputs[0] = PD_PredictorGetInputHandle(predictor, input_names->data[0]); + inputs[1] = PD_PredictorGetInputHandle(predictor, input_names->data[1]); + LOG(INFO) << "Predictor start run!"; + // inputs[0]: word, use lod memory in stack + int32_t shape_0[2] = {11, 1}; + int64_t data_0[11 * 1] = {12673, 9763, 905, 284, 45, 7474, 20, 17, 1, 4, 9}; + size_t lod_layer_0[2] = {0, 11}; + PD_OneDimArraySize layer_0; + layer_0.size = 2; + layer_0.data = lod_layer_0; + PD_OneDimArraySize *layer_0_ptr = &layer_0; + PD_TwoDimArraySize lod_0; + lod_0.size = 1; + lod_0.data = &layer_0_ptr; + PD_TensorReshape(inputs[0], 2, shape_0); + PD_TensorCopyFromCpuInt64(inputs[0], data_0); + PD_TensorSetLod(inputs[0], &lod_0); + + // inputs[1]: mention, use lod memory in heap + int32_t shape_1[2] = {11, 1}; + int64_t data_1[11 * 1] = {27, 0, 0, 33, 34, 33, 0, 0, 0, 1, 2}; + PD_TwoDimArraySize *lod_1_ptr = new PD_TwoDimArraySize(); + lod_1_ptr->size = 1; + lod_1_ptr->data = new PD_OneDimArraySize *[1]; + lod_1_ptr->data[0] = new PD_OneDimArraySize(); + lod_1_ptr->data[0]->size = 2; + lod_1_ptr->data[0]->data = new size_t[2]; + lod_1_ptr->data[0]->data[0] = 0; + lod_1_ptr->data[0]->data[1] = 11; + + PD_TensorReshape(inputs[1], 2, shape_1); + PD_TensorCopyFromCpuInt64(inputs[1], data_1); + PD_TensorSetLod(inputs[1], lod_1_ptr); + // retrieve the lod memory + delete[] lod_1_ptr->data[0]->data; + delete lod_1_ptr->data[0]; + delete[] lod_1_ptr->data; + delete lod_1_ptr; + lod_1_ptr = nullptr; + + LOG(INFO) << "Predictor start run!"; + bool success = PD_PredictorRun(predictor); + EXPECT_TRUE(success); + LOG(INFO) << "Predictor run success!"; + PD_OneDimArrayCstr *output_names = PD_PredictorGetOutputNames(predictor); + PD_Tensor *output = + PD_PredictorGetOutputHandle(predictor, output_names->data[0]); + PD_TwoDimArraySize *output_lod = PD_TensorGetLod(output); + + PD_TwoDimArraySizeDestroy(output_lod); + PD_TensorDestroy(output); + PD_OneDimArrayCstrDestroy(output_names); + + PD_TensorDestroy(inputs[0]); + PD_TensorDestroy(inputs[1]); + PD_OneDimArrayCstrDestroy(input_names); + PD_PredictorDestroy(predictor); +} + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tests/api/analyzer_capi_exp_pd_config_tester.cc b/paddle/fluid/inference/tests/api/analyzer_capi_exp_pd_config_tester.cc new file mode 100644 index 0000000000000..18107704ae420 --- /dev/null +++ b/paddle/fluid/inference/tests/api/analyzer_capi_exp_pd_config_tester.cc @@ -0,0 +1,108 @@ 
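The NER tester above sets a single-sequence LoD of {0, 11} on each input, so the nesting of PD_TwoDimArraySize is easy to miss. A sketch with assumed values, not part of this patch: a batch holding two sequences of lengths 5 and 6 (11 rows total).

#include "paddle/fluid/inference/capi_exp/pd_inference_api.h"

// Level-0 offsets {0, 5, 11}: rows [0, 5) form the first sequence and rows
// [5, 11) the second; the outer PD_TwoDimArraySize holds one entry per level.
void SetTwoSequenceLod(PD_Tensor* word_tensor) {
  size_t offsets[3] = {0, 5, 11};
  PD_OneDimArraySize level;
  level.size = 3;
  level.data = offsets;
  PD_OneDimArraySize* level_ptr = &level;

  PD_TwoDimArraySize lod;
  lod.size = 1;           // a single LoD level
  lod.data = &level_ptr;  // array of per-level offset arrays
  PD_TensorSetLod(word_tensor, &lod);
}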
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include +#include +#include +#include "paddle/fluid/inference/capi_exp/pd_inference_api.h" +#include "paddle/fluid/inference/tests/api/tester_helper.h" + +namespace paddle { +namespace inference { +namespace analysis { + +TEST(PD_Config, interface) { + std::string model_dir = FLAGS_infer_model + "/mobilenet"; + std::string prog_file = model_dir + "/__model__"; + std::string param_file = model_dir + "/__params__"; + std::string opt_cache_dir = FLAGS_infer_model + "/OptimCacheDir"; + + PD_Config* config = PD_ConfigCreate(); + PD_ConfigSetModelDir(config, model_dir.c_str()); + std::string model_dir_ = PD_ConfigGetModelDir(config); + EXPECT_EQ(model_dir, model_dir_); + + PD_ConfigSetModel(config, prog_file.c_str(), param_file.c_str()); + PD_ConfigSetProgFile(config, prog_file.c_str()); + PD_ConfigSetParamsFile(config, param_file.c_str()); + PD_ConfigSetOptimCacheDir(config, opt_cache_dir.c_str()); + std::string prog_file_ = PD_ConfigGetProgFile(config); + std::string param_file_ = PD_ConfigGetParamsFile(config); + EXPECT_EQ(prog_file, prog_file_); + EXPECT_EQ(param_file, param_file_); + + PD_ConfigDisableFCPadding(config); + bool fc_padding = PD_ConfigUseFcPadding(config); + EXPECT_FALSE(fc_padding); + + PD_ConfigDisableGpu(config); + PD_ConfigSwitchIrOptim(config, TRUE); + bool ir_optim = PD_ConfigIrOptim(config); + EXPECT_TRUE(ir_optim); + +#ifndef PADDLE_WITH_LITE + PD_ConfigEnableLiteEngine(config, PD_PRECISION_FLOAT32, TRUE, 0, nullptr, 0, + nullptr); + bool lite_enabled = PD_ConfigLiteEngineEnabled(config); + EXPECT_TRUE(lite_enabled); +#endif + + PD_ConfigSwitchIrDebug(config, TRUE); +#ifdef PADDLE_WITH_MKLDNN + const char* ops_name = "conv_2d"; + PD_ConfigEnableMKLDNN(config); + PD_ConfigSetMkldnnOp(config, 1, &ops_name); + PD_ConfigSetMkldnnCacheCapacity(config, 100); + bool mkldnn_enabled = PD_ConfigMkldnnEnabled(config); + EXPECT_TRUE(mkldnn_enabled); + + PD_ConfigSetCpuMathLibraryNumThreads(config, 10); + int32_t cpu_threads = PD_ConfigGetCpuMathLibraryNumThreads(config); + EXPECT_EQ(cpu_threads, 10); + + PD_ConfigEnableMkldnnQuantizer(config); + bool mkldnn_qt_enabled = PD_ConfigMkldnnQuantizerEnabled(config); + EXPECT_TRUE(mkldnn_qt_enabled); + + PD_ConfigEnableMkldnnBfloat16(config); + PD_ConfigSetBfloat16Op(config, 1, &ops_name); + bool mkldnn_bf16_enabled = PD_ConfigMkldnnBfloat16Enabled(config); + EXPECT_TRUE(mkldnn_bf16_enabled); +#endif + + PD_ConfigEnableMemoryOptim(config); + bool memory_enabled = PD_ConfigMemoryOptimEnabled(config); + EXPECT_TRUE(memory_enabled); + + PD_ConfigEnableProfile(config); + bool profile_enabled = PD_ConfigProfileEnabled(config); + EXPECT_TRUE(profile_enabled); + + PD_ConfigDisableGlogInfo(config); + bool glog_diabled = PD_ConfigGlogInfoDisabled(config); + EXPECT_TRUE(glog_diabled); + + PD_ConfigSetInvalid(config); + bool is_valid = PD_ConfigIsValid(config); + EXPECT_FALSE(is_valid); + + PD_ConfigPartiallyRelease(config); + 
PD_ConfigDestroy(config); +} + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tests/api/analyzer_capi_exp_pd_tensor_tester.cc b/paddle/fluid/inference/tests/api/analyzer_capi_exp_pd_tensor_tester.cc new file mode 100644 index 0000000000000..f4017fc5a7f34 --- /dev/null +++ b/paddle/fluid/inference/tests/api/analyzer_capi_exp_pd_tensor_tester.cc @@ -0,0 +1,196 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "paddle/fluid/inference/capi_exp/pd_inference_api.h" +#include "paddle/fluid/inference/tests/api/tester_helper.h" + +namespace paddle { +namespace inference { +namespace analysis { + +void PD_run() { + auto model_dir = FLAGS_infer_model; + PD_Config* config = PD_ConfigCreate(); + PD_ConfigSetModel(config, (model_dir + "/__model__").c_str(), + (model_dir + "/__params__").c_str()); + PD_Predictor* predictor = PD_PredictorCreate(config); + PD_OneDimArrayCstr* input_names = PD_PredictorGetInputNames(predictor); + PD_Tensor* tensor = + PD_PredictorGetInputHandle(predictor, input_names->data[0]); + + int32_t shapes[4] = {1, 3, 300, 300}; + std::vector input(1 * 3 * 300 * 300, 0); + int32_t size; + PD_PlaceType place; + PD_TensorReshape(tensor, 4, shapes); + PD_TensorCopyFromCpuFloat(tensor, input.data()); + PD_TensorDataFloat(tensor, &place, &size); + PD_TensorMutableDataFloat(tensor, place); + + PD_TwoDimArraySize lod; + lod.size = 0; + lod.data = NULL; + PD_TensorSetLod(tensor, &lod); + + PD_PredictorRun(predictor); + + std::vector out_data; + PD_OneDimArrayCstr* output_names = PD_PredictorGetOutputNames(predictor); + PD_Tensor* output_tensor = + PD_PredictorGetOutputHandle(predictor, output_names->data[0]); + PD_OneDimArrayInt32* output_shape = PD_TensorGetShape(output_tensor); + int32_t out_num = std::accumulate(output_shape->data, + output_shape->data + output_shape->size, 1, + std::multiplies()); + out_data.resize(out_num); + PD_TensorCopyToCpuFloat(output_tensor, out_data.data()); + LOG(INFO) << "Output tensor name is: " << PD_TensorGetName(output_tensor); + PD_DataType data_type = PD_TensorGetDataType(output_tensor); + EXPECT_EQ(data_type, PD_DATA_FLOAT32); + + PD_TwoDimArraySize* out_lod = PD_TensorGetLod(output_tensor); + + PD_TwoDimArraySizeDestroy(out_lod); + PD_OneDimArrayInt32Destroy(output_shape); + PD_TensorDestroy(output_tensor); + PD_OneDimArrayCstrDestroy(output_names); + PD_TensorDestroy(tensor); + PD_OneDimArrayCstrDestroy(input_names); + PD_PredictorDestroy(predictor); +} +TEST(PD_Tensor, PD_run) { PD_run(); } + +TEST(PD_Tensor, int32) { + auto model_dir = FLAGS_infer_model; + PD_Config* config = PD_ConfigCreate(); + PD_ConfigSetModel(config, (model_dir + "/__model__").c_str(), + (model_dir + "/__params__").c_str()); + PD_Predictor* predictor = PD_PredictorCreate(config); + PD_OneDimArrayCstr* input_names = PD_PredictorGetInputNames(predictor); + PD_Tensor* tensor = + 
PD_PredictorGetInputHandle(predictor, input_names->data[0]); + int32_t shapes[4] = {1, 3, 300, 300}; + std::vector input(1 * 3 * 300 * 300, 0); + int32_t size; + PD_PlaceType place; + PD_TensorReshape(tensor, 4, shapes); + PD_TensorCopyFromCpuInt32(tensor, input.data()); + int32_t* data_ptr = PD_TensorDataInt32(tensor, &place, &size); + EXPECT_EQ(place, PD_PLACE_CPU); + EXPECT_EQ(size, 1 * 3 * 300 * 300); + int32_t* mutable_data_ptr = PD_TensorMutableDataInt32(tensor, place); + EXPECT_EQ(data_ptr, mutable_data_ptr); + + PD_DataType data_type = PD_TensorGetDataType(tensor); + EXPECT_EQ(data_type, PD_DATA_INT32); + PD_TensorCopyToCpuInt32(tensor, input.data()); + + PD_TensorDestroy(tensor); + PD_OneDimArrayCstrDestroy(input_names); + PD_PredictorDestroy(predictor); +} + +TEST(PD_Tensor, int64) { + auto model_dir = FLAGS_infer_model; + PD_Config* config = PD_ConfigCreate(); + PD_ConfigSetModel(config, (model_dir + "/__model__").c_str(), + (model_dir + "/__params__").c_str()); + PD_Predictor* predictor = PD_PredictorCreate(config); + PD_OneDimArrayCstr* input_names = PD_PredictorGetInputNames(predictor); + PD_Tensor* tensor = + PD_PredictorGetInputHandle(predictor, input_names->data[0]); + int32_t shapes[4] = {1, 3, 300, 300}; + std::vector input(1 * 3 * 300 * 300, 0); + int32_t size; + PD_PlaceType place; + PD_TensorReshape(tensor, 4, shapes); + PD_TensorCopyFromCpuInt64(tensor, input.data()); + int64_t* data_ptr = PD_TensorDataInt64(tensor, &place, &size); + EXPECT_EQ(place, PD_PLACE_CPU); + EXPECT_EQ(size, 1 * 3 * 300 * 300); + int64_t* mutable_data_ptr = PD_TensorMutableDataInt64(tensor, place); + EXPECT_EQ(data_ptr, mutable_data_ptr); + + PD_DataType data_type = PD_TensorGetDataType(tensor); + EXPECT_EQ(data_type, PD_DATA_INT64); + PD_TensorCopyToCpuInt64(tensor, input.data()); + + PD_TensorDestroy(tensor); + PD_OneDimArrayCstrDestroy(input_names); + PD_PredictorDestroy(predictor); +} + +TEST(PD_Tensor, uint8) { + auto model_dir = FLAGS_infer_model; + PD_Config* config = PD_ConfigCreate(); + PD_ConfigSetModel(config, (model_dir + "/__model__").c_str(), + (model_dir + "/__params__").c_str()); + PD_Predictor* predictor = PD_PredictorCreate(config); + PD_OneDimArrayCstr* input_names = PD_PredictorGetInputNames(predictor); + PD_Tensor* tensor = + PD_PredictorGetInputHandle(predictor, input_names->data[0]); + int32_t shapes[4] = {1, 3, 300, 300}; + uint8_t input[1 * 3 * 300 * 300] = {0}; + int32_t size; + PD_PlaceType place; + PD_TensorReshape(tensor, 4, shapes); + PD_TensorCopyFromCpuUint8(tensor, input); + uint8_t* data_ptr = PD_TensorDataUint8(tensor, &place, &size); + EXPECT_EQ(place, PD_PLACE_CPU); + EXPECT_EQ(size, 1 * 3 * 300 * 300); + uint8_t* mutable_data_ptr = PD_TensorMutableDataUint8(tensor, place); + EXPECT_EQ(data_ptr, mutable_data_ptr); + + PD_DataType data_type = PD_TensorGetDataType(tensor); + EXPECT_EQ(data_type, PD_DATA_UINT8); + PD_TensorCopyToCpuUint8(tensor, input); + + PD_TensorDestroy(tensor); + PD_OneDimArrayCstrDestroy(input_names); + PD_PredictorDestroy(predictor); +} + +std::string read_file(std::string filename) { + std::ifstream file(filename); + return std::string((std::istreambuf_iterator(file)), + std::istreambuf_iterator()); +} + +TEST(PD_Tensor, from_buffer) { + PD_Config* config = PD_ConfigCreate(); + std::string prog_file = FLAGS_infer_model + "/__model__"; + std::string params_file = FLAGS_infer_model + "/__params__"; + + std::string prog_str = read_file(prog_file); + std::string params_str = read_file(params_file); + + PD_ConfigSetModelBuffer(config, 
prog_str.c_str(), prog_str.size(), + params_str.c_str(), params_str.size()); + + bool model_from_memory = PD_ConfigModelFromMemory(config); + EXPECT_TRUE(model_from_memory); + PD_ConfigDestroy(config); +} + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tests/api/analyzer_capi_exp_pd_threads_tester.cc b/paddle/fluid/inference/tests/api/analyzer_capi_exp_pd_threads_tester.cc new file mode 100644 index 0000000000000..8951c446b1f83 --- /dev/null +++ b/paddle/fluid/inference/tests/api/analyzer_capi_exp_pd_threads_tester.cc @@ -0,0 +1,119 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "paddle/fluid/inference/capi_exp/pd_inference_api.h" +#include "paddle/fluid/inference/tests/api/tester_helper.h" + +namespace paddle { +namespace inference { +namespace analysis { + +typedef struct RunParameter { + PD_Predictor* predictor; + int32_t* shapes; + size_t shape_size; + float* input_data; + int32_t out_size; + float* out_data; + int32_t thread_index; +} RunParameter; + +void* run(void* thread_param) { + struct RunParameter* param = (struct RunParameter*)thread_param; + LOG(INFO) << "Thread " << param->thread_index << " start run!"; + PD_OneDimArrayCstr* input_names = PD_PredictorGetInputNames(param->predictor); + PD_Tensor* tensor = + PD_PredictorGetInputHandle(param->predictor, input_names->data[0]); + PD_TensorReshape(tensor, param->shape_size, param->shapes); + PD_TensorCopyFromCpuFloat(tensor, param->input_data); + PD_PredictorRun(param->predictor); + PD_OneDimArrayCstr* output_names = + PD_PredictorGetOutputNames(param->predictor); + PD_Tensor* output_tensor = + PD_PredictorGetOutputHandle(param->predictor, output_names->data[0]); + PD_OneDimArrayInt32* output_shape = PD_TensorGetShape(output_tensor); + param->out_size = 1; + for (size_t index = 0; index < output_shape->size; ++index) { + param->out_size = param->out_size * output_shape->data[index]; + } + PD_OneDimArrayInt32Destroy(output_shape); + param->out_data = + reinterpret_cast(malloc(param->out_size * sizeof(float))); + PD_TensorCopyToCpuFloat(output_tensor, param->out_data); + PD_TensorDestroy(output_tensor); + PD_OneDimArrayCstrDestroy(output_names); + PD_TensorDestroy(tensor); + PD_OneDimArrayCstrDestroy(input_names); + LOG(INFO) << "Thread " << param->thread_index << " end run!"; + return NULL; +} +void threads_run(int thread_num) { + auto model_dir = FLAGS_infer_model; + PD_Config* config = PD_ConfigCreate(); + PD_ConfigSetModel(config, (model_dir + "/__model__").c_str(), + (model_dir + "/__params__").c_str()); + PD_Predictor* predictor = PD_PredictorCreate(config); + + pthread_t* threads = + reinterpret_cast(malloc(thread_num * sizeof(pthread_t))); + RunParameter* params = reinterpret_cast( + malloc(thread_num * sizeof(RunParameter))); + int32_t shapes[4] = {1, 3, 300, 300}; + float* input = + reinterpret_cast(malloc(1 * 3 * 300 * 
300 * sizeof(float))); + memset(input, 0, 1 * 3 * 300 * 300 * sizeof(float)); + for (int i = 0; i < thread_num; ++i) { + params[i].predictor = PD_PredictorClone(predictor); + params[i].shapes = shapes; + params[i].shape_size = 4; + params[i].input_data = input; + params[i].out_size = 0; + params[i].out_data = NULL; + params[i].thread_index = i; + pthread_create(&(threads[i]), NULL, run, (params + i)); + } + for (int i = 0; i < thread_num; ++i) { + pthread_join(threads[i], NULL); + } + ASSERT_GT(params[0].out_size, 0); + + for (int i = 1; i < thread_num; ++i) { + ASSERT_EQ(params[i].out_size, params[0].out_size); + for (int j = 0; j < params[i].out_size; ++j) { + ASSERT_EQ(params[i].out_data[j], params[0].out_data[j]); + } + } + for (int i = 0; i < thread_num; ++i) { + PD_PredictorDestroy(params[i].predictor); + free(params[i].out_data); + } + free(input); + free(params); + free(threads); + PD_PredictorDestroy(predictor); +} + +TEST(PD_Predictor, PD_multi_threads_run) { threads_run(10); } + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tests/api/analyzer_capi_exp_tester.cc b/paddle/fluid/inference/tests/api/analyzer_capi_exp_tester.cc new file mode 100644 index 0000000000000..11de1a5a6fab4 --- /dev/null +++ b/paddle/fluid/inference/tests/api/analyzer_capi_exp_tester.cc @@ -0,0 +1,83 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
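The threads tester above drives PD_PredictorClone through raw pthreads and malloc. The same clone-per-thread pattern is sketched below with std::thread for C++ callers; this is illustrative only, with the output bookkeeping of run() omitted and the 1x3x300x300 shape kept as an assumption.

#include <thread>
#include <vector>
#include "paddle/fluid/inference/capi_exp/pd_inference_api.h"

// Per-thread body: the same steps as run() above, minus output collection.
void RunClone(PD_Predictor* predictor, float* input) {
  PD_OneDimArrayCstr* names = PD_PredictorGetInputNames(predictor);
  PD_Tensor* tensor = PD_PredictorGetInputHandle(predictor, names->data[0]);
  int32_t shape[4] = {1, 3, 300, 300};
  PD_TensorReshape(tensor, 4, shape);
  PD_TensorCopyFromCpuFloat(tensor, input);
  PD_PredictorRun(predictor);
  PD_TensorDestroy(tensor);
  PD_OneDimArrayCstrDestroy(names);
}

void ThreadedRun(PD_Predictor* main_predictor, float* input, int thread_num) {
  std::vector<PD_Predictor*> clones(thread_num);
  std::vector<std::thread> workers;
  for (int i = 0; i < thread_num; ++i) {
    clones[i] = PD_PredictorClone(main_predictor);  // one clone per thread
    workers.emplace_back(RunClone, clones[i], input);
  }
  for (auto& t : workers) t.join();
  for (auto* clone : clones) PD_PredictorDestroy(clone);
}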
*/ + +#include +#include +#include +#include +#include + +#include "paddle/fluid/inference/capi_exp/pd_inference_api.h" +#include "paddle/fluid/inference/tests/api/tester_helper.h" + +namespace paddle { +namespace inference { +namespace analysis { + +void predictor_run() { + std::string model_dir = FLAGS_infer_model; + std::string prog_file = model_dir + "/model"; + std::string params_file = model_dir + "/params"; + PD_Config *config = PD_ConfigCreate(); + PD_ConfigDisableGpu(config); + PD_ConfigSetCpuMathLibraryNumThreads(config, 10); + PD_ConfigSwitchIrDebug(config, TRUE); + PD_ConfigSetModel(config, prog_file.c_str(), params_file.c_str()); + + PD_Predictor *predictor = PD_PredictorCreate(config); + PD_Tensor *tensor = PD_PredictorGetInputHandle(predictor, "data"); + + const int batch_size = 1; + const int channels = 3; + const int height = 318; + const int width = 318; + float *input = new float[batch_size * channels * height * width](); + + int32_t shape[4] = {batch_size, channels, height, width}; + PD_TensorReshape(tensor, 4, shape); + PD_TensorCopyFromCpuFloat(tensor, input); + EXPECT_TRUE(PD_PredictorRun(predictor)); + + delete[] input; + PD_TensorDestroy(tensor); + PD_PredictorDestroy(predictor); +} + +TEST(PD_PredictorRun, predictor_run) { predictor_run(); } + +#ifdef PADDLE_WITH_MKLDNN +TEST(PD_Config, profile_mkldnn) { + std::string model_dir = FLAGS_infer_model; + std::string prog_file = model_dir + "/model"; + std::string params_file = model_dir + "/params"; + PD_Config *config = PD_ConfigCreate(); + PD_ConfigDisableGpu(config); + PD_ConfigSetCpuMathLibraryNumThreads(config, 10); + PD_ConfigSwitchIrDebug(config, TRUE); + PD_ConfigEnableMKLDNN(config); + bool mkldnn_enable = PD_ConfigMkldnnEnabled(config); + EXPECT_TRUE(mkldnn_enable); + PD_ConfigEnableMkldnnQuantizer(config); + bool quantizer_enable = PD_ConfigMkldnnQuantizerEnabled(config); + EXPECT_TRUE(quantizer_enable); + PD_ConfigEnableMkldnnBfloat16(config); + PD_ConfigSetMkldnnCacheCapacity(config, 0); + PD_ConfigSetModel(config, prog_file.c_str(), params_file.c_str()); + PD_ConfigDestroy(config); +} +#endif + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tests/api/analyzer_capi_exp_xpu_tester.cc b/paddle/fluid/inference/tests/api/analyzer_capi_exp_xpu_tester.cc new file mode 100644 index 0000000000000..f4fd04e85840d --- /dev/null +++ b/paddle/fluid/inference/tests/api/analyzer_capi_exp_xpu_tester.cc @@ -0,0 +1,60 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
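The basic tester above stops after PD_PredictorRun; fetching a result uses the same handle-based calls seen in the other testers. A compact sketch, assuming the predictor has already run and taking the first output (output names and sizes depend on the model):

#include <vector>
#include "paddle/fluid/inference/capi_exp/pd_inference_api.h"

std::vector<float> FetchFirstOutput(PD_Predictor* predictor) {
  PD_OneDimArrayCstr* out_names = PD_PredictorGetOutputNames(predictor);
  PD_Tensor* out = PD_PredictorGetOutputHandle(predictor, out_names->data[0]);
  PD_OneDimArrayInt32* shape = PD_TensorGetShape(out);
  int64_t numel = 1;
  for (size_t i = 0; i < shape->size; ++i) numel *= shape->data[i];
  std::vector<float> result(numel);
  PD_TensorCopyToCpuFloat(out, result.data());
  PD_OneDimArrayInt32Destroy(shape);
  PD_TensorDestroy(out);
  PD_OneDimArrayCstrDestroy(out_names);
  return result;
}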
*/ + +#include +#include +#include +#include +#include +#include "paddle/fluid/inference/capi_exp/pd_inference_api.h" +#include "paddle/fluid/inference/tests/api/tester_helper.h" + +namespace paddle { +namespace inference { +namespace analysis { + +#ifdef PADDLE_WITH_XPU +TEST(PD_Config, use_xpu) { + std::string model_dir = FLAGS_infer_model + "/mobilenet"; + PD_Config *config = PD_ConfigCreate(); + PD_ConfigSwitchIrDebug(config, TRUE); + PD_ConfigSetModelDir(config, model_dir.c_str()); + PD_ConfigSetOptimCacheDir(config, + (FLAGS_infer_model + "/OptimCacheDir").c_str()); + const char *model_dir_ = PD_ConfigGetModelDir(config); + LOG(INFO) << model_dir_; + PD_ConfigEnableXpu(config, 0xfffc00); + bool use_xpu = PD_ConfigUseXpu(config); + EXPECT_TRUE(use_xpu); + int32_t device_id = PD_ConfigXpuDeviceId(config); + EXPECT_EQ(device_id, 0); + PD_ConfigSwitchIrOptim(config, TRUE); + bool ir_optim = PD_ConfigIrOptim(config); + EXPECT_TRUE(ir_optim); + PD_ConfigEnableMemoryOptim(config); + bool memory_optim_enable = PD_ConfigMemoryOptimEnabled(config); + EXPECT_TRUE(memory_optim_enable); + PD_ConfigEnableProfile(config); + bool profiler_enable = PD_ConfigProfileEnabled(config); + EXPECT_TRUE(profiler_enable); + PD_ConfigSetInvalid(config); + bool is_valid = PD_ConfigIsValid(config); + EXPECT_FALSE(is_valid); + PD_ConfigDestroy(config); +} +#endif + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/memory/memcpy.cc b/paddle/fluid/memory/memcpy.cc index d9a4503cc1e5f..730d49e8acd93 100644 --- a/paddle/fluid/memory/memcpy.cc +++ b/paddle/fluid/memory/memcpy.cc @@ -206,12 +206,20 @@ void Copy(platform::NPUPlace dst_place, if (UNLIKELY(num == 0)) return; platform::SetNPUDeviceId(dst_place.device); + VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to " << dst_place << " by stream(" << stream << ")"; + if (stream) { platform::RecordEvent record_event("NpuMemcpyAsync:CPU->NPU"); platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_HOST_TO_DEVICE, stream); } else { + // On NPU, async operation after sync operation is ok, while sync operation + // after async is not ok, since the async operation may not be done. + // So, it is necessary to wait before the sync operation.
+ platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + static_cast(pool.Get(dst_place))->Wait(); + platform::RecordEvent record_event("NpuMemcpySync:CPU->NPU"); platform::NPUMemcpySync(dst, src, num, ACL_MEMCPY_HOST_TO_DEVICE); } @@ -226,12 +234,17 @@ void Copy(platform::CPUPlace dst_place, if (UNLIKELY(num == 0)) return; platform::SetNPUDeviceId(src_place.device); + VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to " << dst_place << " by thream(" << stream << ")"; + if (stream) { platform::RecordEvent record_event("NpuMemcpyAsync:NPU->CPU"); platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_DEVICE_TO_HOST, stream); } else { + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + static_cast(pool.Get(src_place))->Wait(); + platform::RecordEvent record_event("GpuMemcpySync:NPU->CPU"); platform::NPUMemcpySync(dst, src, num, ACL_MEMCPY_DEVICE_TO_HOST); } @@ -254,6 +267,10 @@ void Copy(platform::NPUPlace dst_place, platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_DEVICE_TO_DEVICE, stream); } else { + platform::DeviceContextPool& pool = + platform::DeviceContextPool::Instance(); + static_cast(pool.Get(dst_place))->Wait(); + platform::RecordEvent record_event("NpuMemcpySync(same_npu):NPU->NPU"); platform::NPUMemcpySync(dst, src, num, ACL_MEMCPY_DEVICE_TO_DEVICE); } @@ -268,6 +285,10 @@ void Copy(platform::NPUPlace dst_place, platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_DEVICE_TO_DEVICE, stream); } else { + platform::DeviceContextPool& pool = + platform::DeviceContextPool::Instance(); + static_cast(pool.Get(dst_place))->Wait(); + platform::RecordEvent record_event("NpuMemcpyPeerSync:NPU->NPU"); platform::NPUMemcpySync(dst, src, num, ACL_MEMCPY_DEVICE_TO_DEVICE); } diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index dac8c7b03e517..6e11c64afc4bd 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -42,6 +42,10 @@ if (WITH_GPU AND TENSORRT_FOUND) add_subdirectory(tensorrt) endif() +if (WITH_DLNNE) + add_subdirectory(dlnne) +endif() + if (WITH_LITE) add_subdirectory(lite) endif() @@ -69,7 +73,7 @@ if(WITH_UNITY_BUILD) include(unity_build_rule.cmake) endif() -register_operators(EXCLUDES py_func_op warpctc_op dgc_op lstm_op run_program_op eye_op recurrent_op +register_operators(EXCLUDES py_layer_op py_func_op warpctc_op dgc_op lstm_op run_program_op eye_op recurrent_op sync_batch_norm_op ${OP_MKL_DEPS} DEPS ${OP_HEADER_DEPS}) op_library(run_program_op SRCS run_program_op.cc run_program_op.cu.cc DEPS executor_cache ${OP_HEADER_DEPS}) @@ -124,6 +128,7 @@ if (WITH_ASCEND) endif() if (WITH_ASCEND_CL) + cc_test(assign_op_npu_test SRCS assign_op_npu_test.cc DEPS assign_op) cc_library(npu_op_runner SRCS npu_op_runner.cc DEPS operator npu_info) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} npu_op_runner) endif() @@ -141,8 +146,8 @@ set(OPERATOR_DEPS ${OPERATOR_DEPS} ${COMMON_OP_DEPS}) set(GLOB_OPERATOR_DEPS ${OPERATOR_DEPS} CACHE INTERNAL "Global Op dependencies") cc_test(test_common_infer_shape_functions SRCS test_common_infer_shape_functions.cc DEPS common_infer_shape_functions ${COMMON_OP_DEPS} activation_op elementwise_add_op softmax_op softmax) -cc_test(assign_op_test SRCS assign_op_test.cc DEPS assign_op) cc_test(gather_test SRCS gather_test.cc DEPS tensor) +cc_test(assign_op_test SRCS assign_op_test.cc DEPS assign_op) cc_test(scatter_test SRCS scatter_test.cc DEPS tensor math_function) cc_test(beam_search_decode_op_test SRCS 
beam_search_decode_op_test.cc DEPS lod_tensor) cc_test(strided_memcpy_test SRCS strided_memcpy_test.cc DEPS tensor memory) @@ -161,12 +166,22 @@ endif() cc_library(tensor_formatter SRCS tensor_formatter.cc DEPS ${OP_HEADER_DEPS}) if (WITH_PYTHON) cc_library(py_func_op SRCS py_func_op.cc DEPS op_registry python pybind) + cc_library(py_layer_op SRCS py_layer_op.cc DEPS op_registry python pybind) +endif() + +if (WITH_ASCEND_CL) + cc_test(range_op_npu_test SRCS range_op_npu_test.cc DEPS op_registry range_op scope device_context enforce executor) + cc_test(expand_op_npu_test SRCS expand_op_npu_test.cc DEPS op_registry expand_op eigen_cc_function scope device_context enforce executor compare_op) endif() set(GLOB_OP_LIB ${OP_LIBRARY} CACHE INTERNAL "Global OP library") add_subdirectory(benchmark) cc_test(op_debug_string_test SRCS op_debug_string_test.cc DEPS elementwise_add_op) +if (WITH_ASCEND_CL) + cc_test(transpose_op_npu_test SRCS transpose_op_npu_test.cc DEPS op_registry transpose_op scope device_context enforce executor) +endif() + if(WITH_MKLDNN) include(mkldnn/inplace_op_tests.cmake) @@ -180,3 +195,11 @@ if(WITH_UNITY_BUILD) # The specified link dependency needs to be displayed here. target_link_libraries(paddle_operators_unity ${OP_HEADER_DEPS} ${COMMON_OP_DEPS}) endif() + +if(WITH_ASCEND_CL) +cc_test(gelu_op_npu_test SRCS gelu_op_npu_test.cc DEPS op_registry gelu_op scope device_context enforce executor) +endif() + +if (WITH_GPU OR WITH_ASCEND_CL) +cc_test(copy_cross_scope_test SRCS copy_cross_scope_test.cc DEPS op_registry copy_cross_scope_op scope device_context enforce executor) +endif() diff --git a/paddle/fluid/operators/activation_op.cc b/paddle/fluid/operators/activation_op.cc index 94f2eb3672bd5..055909ba6f486 100644 --- a/paddle/fluid/operators/activation_op.cc +++ b/paddle/fluid/operators/activation_op.cc @@ -162,6 +162,12 @@ Sigmoid Activation Operator )DOC"; +UNUSED constexpr char SiluDoc[] = R"DOC( +Silu Activation Operator + +$$out = x * \\frac{1}{1 + e^{-x}}$$ +)DOC"; + UNUSED constexpr char LogSigmoidDoc[] = R"DOC( Logsigmoid Activation Operator @@ -697,6 +703,7 @@ It is recommended to use the defaults for this activation. 
}; REGISTER_ACTIVATION_OP_MAKER(Sigmoid, SigmoidDoc); +REGISTER_ACTIVATION_OP_MAKER(Silu, SiluDoc); REGISTER_ACTIVATION_OP_MAKER(LogSigmoid, LogSigmoidDoc); REGISTER_ACTIVATION_OP_MAKER(Exp, ExpDoc); REGISTER_ACTIVATION_OP_MAKER(Relu, ReluDoc); @@ -782,6 +789,26 @@ class ActivationOpDoubleGrad2 : public framework::OperatorWithKernel { } }; +template +class TanhDoubleGradMaker : public ::paddle::framework::SingleGradOpMaker { + public: + using ::paddle::framework::SingleGradOpMaker::SingleGradOpMaker; + + protected: + void Apply(GradOpPtr op) const override { + op->SetType("tanh_grad_grad"); + // input1: Out + op->SetInput("Out", this->Input("Out")); + // input2: ddx + op->SetInput("DDX", this->OutputGrad(framework::GradVarName("X"))); + op->SetInput("DOut", this->Input(framework::GradVarName("Out"))); + op->SetAttrMap(this->Attrs()); + // output: ddy + op->SetOutput("DOutNew", this->InputGrad("Out")); + op->SetOutput("DDOut", this->InputGrad(framework::GradVarName("Out"))); + } +}; + // ReluGrad: dx = dy if y >= 0 else 0 // ReluGradGrad: ddy = ddx if y >= 0 else 0 template @@ -1041,6 +1068,34 @@ namespace plat = paddle::platform; FOR_EACH_ACTIVATION_OP(REGISTER_ACTIVATION_OP); FOR_EACH_ACTIVATION_OP(REGISTER_ACTIVATION_CPU_KERNEL); +/* ========================== tanh register ============================= */ +REGISTER_OPERATOR( + tanh, ops::ActivationOp, ops::TanhOpMaker, ops::ActivationOpInferVarType, + ops::ActivationGradOpMaker::FwdDeps(), + paddle::framework::OpDesc>, + ops::ActivationGradOpMaker::FwdDeps(), + paddle::imperative::OpBase>, + std::conditional>(), + ops::ActFwdInplaceInferer, void>::type); +REGISTER_OPERATOR(tanh_grad, ops::ActivationOpGrad, + ops::ActivationGradOpInplaceInferer, + ops::TanhDoubleGradMaker, + ops::TanhDoubleGradMaker) +REGISTER_OPERATOR( + tanh_grad_grad, + ops::ActivationOpDoubleGrad::FwdDeps()>, + ops::ActivationDoubleGradOpInplaceInferer); + +REGISTER_ACTIVATION_CPU_KERNEL(tanh, Tanh, TanhFunctor, TanhGradFunctor); +REGISTER_OP_CPU_KERNEL( + tanh_grad_grad, ops::TanhDoubleGradKernel>, + ops::TanhDoubleGradKernel>, + ops::TanhDoubleGradKernel>); +/* ========================================================================== */ + /* ========================== relu register ============================= */ REGISTER_OPERATOR( relu, ops::ActivationOp, ops::ReluOpMaker, ops::ActivationOpInferVarType, diff --git a/paddle/fluid/operators/activation_op.cu b/paddle/fluid/operators/activation_op.cu index 04f329088fafe..781a97c1ffcc1 100644 --- a/paddle/fluid/operators/activation_op.cu +++ b/paddle/fluid/operators/activation_op.cu @@ -468,6 +468,19 @@ REGISTER_OP_CUDA_KERNEL( ops::ReluGradGradFunctor>); /* ========================================================================== */ +/* =========================== tanh register ============================ */ +REGISTER_ACTIVATION_CUDA_KERNEL(tanh, Tanh, TanhFunctor, TanhGradFunctor); + +REGISTER_OP_CUDA_KERNEL( + tanh_grad_grad, + ops::TanhDoubleGradKernel>, + ops::TanhDoubleGradKernel>, + ops::TanhDoubleGradKernel>); +/* ========================================================================== */ + /* =========================== sqrt register ============================= */ REGISTER_ACTIVATION_CUDA_KERNEL(sqrt, Sqrt, SqrtFunctor, SqrtGradFunctor); diff --git a/paddle/fluid/operators/activation_op.h b/paddle/fluid/operators/activation_op.h index fb5c4db91ec20..7245dea9cf949 100644 --- a/paddle/fluid/operators/activation_op.h +++ b/paddle/fluid/operators/activation_op.h @@ -258,6 +258,31 @@ struct 
SigmoidGradFunctor : public BaseActivationFunctor { static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } }; +// silu(x) = x / (1 + exp(-x)) +template +struct SiluFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out) const { + auto temp = static_cast(1) / (static_cast(1) + (-x).exp()); + out.device(d) = x * temp; + } +}; + +// silu'(x) = (1 / (1 + e^{-x})) * (1 + out * e^{-x})) +template +struct SiluGradFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + auto temp1 = static_cast(1) + (-x).exp(); // 1+e^(-x) + auto temp2 = x * (-x).exp(); // x*e^(-x) + dx.device(d) = dout * ((static_cast(1) / temp1) * + (static_cast(1) + (temp2 / temp1))); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + // Originally: logsigmoid(x) = -log (1 + exp(-x)) // For numerical stability, we can use the log-sum-exp trick: // https://hips.seas.harvard.edu/blog/2013/01/09/computing-log-sum-exp/ @@ -366,6 +391,36 @@ struct TanhGradFunctor : public BaseActivationFunctor { static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } }; +template +struct TanhGradGradFunctor : public BaseActivationFunctor { + template + void operator()(const Device& dev, const framework::Tensor* Out, + const framework::Tensor* ddX, const framework::Tensor* dOut, + framework::Tensor* dOutNew, framework::Tensor* ddOut) const { + auto* d = dev.eigen_device(); + auto ddx = framework::EigenVector::Flatten( + GET_DATA_SAFELY(ddX, "Input", "DDX", "TanhGradGrad")); + auto out = framework::EigenVector::Flatten( + GET_DATA_SAFELY(Out, "Input", "Out", "TanhGradGrad")); + // tanh grad grad : ddout = (1 - out^2) * ddx, dout = - (dout_old * 2 * out + // * ddx) + if (dOutNew) { + auto dout = framework::EigenVector::Flatten( + GET_DATA_SAFELY(dOut, "Input", "DOut", "TanhGradGrad")); + auto dout_new = framework::EigenVector::Flatten( + GET_DATA_SAFELY(dOutNew, "Output", "DOutNew", "SquareGradGrad")); + dout_new.device(*d) = + static_cast(-1) * dout * static_cast(2) * out * ddx; + } + if (ddOut) { + auto ddout = framework::EigenVector::Flatten( + GET_DATA_SAFELY(ddOut, "Output", "DDOut", "SquareGradGrad")); + ddout.device(*d) = (static_cast(1) - out * out) * ddx; + } + } + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } +}; + // tanhshrink(x) = x - tanh(x) // where tanh(x) = (exp(x) - exp(-x)) / (exp(x) + exp(-x)) template @@ -1734,6 +1789,58 @@ inline void ExtractDoubleGradTensorWithInputDOut( } } +template +class TanhDoubleGradKernel + : public framework::OpKernel { + public: + using T = typename Functor::ELEMENT_TYPE; + void Compute(const framework::ExecutionContext& ctx) const override { + const framework::Tensor *Out, *ddX, *dOut; + framework::Tensor *dOutNew, *ddOut; + Out = ddX = dOut = nullptr; + dOutNew = ddOut = nullptr; + + // extract ddx(input) and out(input) + auto ddx_var = ctx.InputVar("DDX"); + auto out_var = ctx.InputVar("Out"); + PADDLE_ENFORCE_NOT_NULL( + ddx_var, platform::errors::NotFound( + "Cannot get input Variable ddx, variable name = %s", + ctx.InputName("DDX"))); + PADDLE_ENFORCE_NOT_NULL( + out_var, platform::errors::NotFound( + "Cannot get input Variable out, variable name = %s", + ctx.InputName("Out"))); + ddX = ctx.Input("DDX"); + Out = ctx.Input("Out"); + + // set output ddout + auto ddout_var = ctx.OutputVar("DDOut"); + if (ddout_var) { + ddOut = ctx.Output("DDOut"); + } + + // extract dOut(intput) + auto dout_var = ctx.InputVar("DOut"); + PADDLE_ENFORCE_NOT_NULL( 
+ dout_var, platform::errors::NotFound( + "Cannot get input Variable dout_var, variable name = %s", + ctx.InputName("DOut"))); + dOut = ctx.Input("DOut"); + + // set output dout_new + auto dout_new_var = ctx.OutputVar("DOutNew"); + if (dout_new_var) { + dOutNew = ctx.Output("DOutNew"); + } + + if (dOutNew) dOutNew->mutable_data(Out->dims(), ctx.GetPlace()); + if (ddOut) ddOut->mutable_data(Out->dims(), ctx.GetPlace()); + auto& place = ctx.template device_context(); + Functor functor; + functor(place, Out, ddX, dOut, dOutNew, ddOut); + } +}; template class SquareDoubleGradKernel : public framework::OpKernel { @@ -2047,8 +2154,8 @@ struct LogGradGradFunctor : public BaseActivationFunctor { #define FOR_EACH_ACTIVATION_OP(__macro) \ __macro(sigmoid, Sigmoid, SigmoidFunctor, SigmoidGradFunctor); \ + __macro(silu, Silu, SiluFunctor, SiluGradFunctor); \ __macro(logsigmoid, LogSigmoid, LogSigmoidFunctor, LogSigmoidGradFunctor); \ - __macro(tanh, Tanh, TanhFunctor, TanhGradFunctor); \ __macro(atan, Atan, AtanFunctor, AtanGradFunctor); \ __macro(softshrink, SoftShrink, SoftShrinkFunctor, SoftShrinkGradFunctor); \ __macro(ceil, Ceil, CeilFunctor, ZeroGradFunctor); \ diff --git a/paddle/fluid/operators/activation_op_npu.cc b/paddle/fluid/operators/activation_op_npu.cc new file mode 100644 index 0000000000000..f368c65823055 --- /dev/null +++ b/paddle/fluid/operators/activation_op_npu.cc @@ -0,0 +1,367 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the Licnse. 
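For reference, the expressions implemented by SiluGradFunctor and TanhGradGradFunctor above follow directly from the definitions used in the DOC strings, with sigma(x) = 1/(1+e^{-x}) and y = tanh(x):

$$\frac{d}{dx}\bigl(x\,\sigma(x)\bigr) = \sigma(x) + x\,\sigma(x)\bigl(1-\sigma(x)\bigr) = \frac{1}{1+e^{-x}}\left(1 + \frac{x\,e^{-x}}{1+e^{-x}}\right)$$

which is the dout * (1/temp1) * (1 + temp2/temp1) expression in SiluGradFunctor, with temp1 = 1 + e^{-x} and temp2 = x e^{-x}. For tanh, the first backward pass computes dx = dout (1 - y^2); treating this as a function of y and dout, its partial derivatives are (1 - y^2) with respect to dout and -2 y dout with respect to y. Scaling each by the incoming gradient ddx gives the two outputs of tanh_grad_grad, DDOut = (1 - y^2) ddx and DOutNew = -2 y dout ddx, matching the comment inside TanhGradGradFunctor.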
*/ + +#include +#include + +#include "paddle/fluid/framework/ddim.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/operators/activation_op.h" +#include "paddle/fluid/operators/npu_op_runner.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +class PowNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* out = ctx.Output("Out"); + auto factor = ctx.Attr("factor"); + + out->mutable_data(ctx.GetPlace()); + + auto runner = NpuOpRunner("Power", {*x}, {*out}, + {{"power", factor}, + {"scale", static_cast(1.0)}, + {"shift", static_cast(0.0)}}); + + auto stream = + ctx.template device_context() + .stream(); + runner.Run(stream); + } +}; + +template +class PowGradNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* dx = ctx.Output(framework::GradVarName("X")); + auto factor = ctx.Attr("factor"); + + auto x_dims = x->dims(); + + auto place = ctx.GetPlace(); + auto stream = + ctx.template device_context() + .stream(); + + // NOTE(liym27): dx = dout * factor * x.pow(factor-1) + + // Step1: Compute x_pow = x.pow(factor-1) + Tensor x_pow(x->type()); + x_pow.mutable_data(x->dims(), place); + auto runner_pow = NpuOpRunner("Power", {*x}, {x_pow}, + {{"power", factor - static_cast(1)}}); + runner_pow.Run(stream); + + // Step 2: Construct a broadcast factor, which has the same shape with x. + + // 2.1 Get a factor tensor with shape [1]. + Tensor factor_tensor(framework::proto::VarType::FP32); + factor_tensor.mutable_data({1}, place); + FillNpuTensorWithConstant(&factor_tensor, factor); + + // 2.2 Get the factor which has the shape with x and the same value with + // factor. 
+ Tensor factor_bc_tensor(framework::proto::VarType::FP32); + factor_bc_tensor.mutable_data(x_dims, place); + auto runner_bc = NpuOpRunner("FillD", {factor_tensor}, {factor_bc_tensor}, + {{"dims", framework::vectorize(x_dims)}}); + runner_bc.Run(stream); + + // Step 3: Compute x_power_mul_factor = factor * x.pow(factor-1) + Tensor x_power_mul_factor(x->type()); + x_power_mul_factor.mutable_data(x->dims(), place); + auto runner_mul_1 = + NpuOpRunner("Mul", {factor_bc_tensor, x_pow}, {x_power_mul_factor}, {}); + runner_mul_1.Run(stream); + + // Step 4: Compute dx = dout * factor * x.pow(factor-1) + dx->mutable_data(place); + auto runner_mul_2 = + NpuOpRunner("Mul", {*dout, x_power_mul_factor}, {*dx}, {}); + runner_mul_2.Run(stream); + } +}; + +template +class ReluNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* out = ctx.Output("Out"); + + out->mutable_data(ctx.GetPlace()); + + auto runner = NpuOpRunner("Relu", + { + *x, + }, + {*out}, {}); + + auto stream = + ctx.template device_context() + .stream(); + runner.Run(stream); + } +}; + +template +class ReluGradNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* out = ctx.Input("Out"); + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* dx = ctx.Output(framework::GradVarName("X")); + + auto stream = + ctx.template device_context() + .stream(); + + dx->mutable_data(ctx.GetPlace()); + auto runner = NpuOpRunner("ReluGrad", {*dout, *out}, {*dx}, {}); + + runner.Run(stream); + } +}; + +template +class SqrtNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + + auto* out = ctx.Output("Out"); + + auto place = ctx.GetPlace(); + + out->mutable_data(place); + + auto stream = + ctx.template device_context() + .stream(); + + auto runner = NpuOpRunner("Sqrt", {*x}, {*out}, {}); + runner.Run(stream); + } +}; + +template +class SqrtGradNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* out = ctx.Input("Out"); + auto* dout = ctx.Input(framework::GradVarName("Out")); + + auto* dx = ctx.Output(framework::GradVarName("X")); + + auto place = ctx.GetPlace(); + + dx->mutable_data(place); + + auto stream = + ctx.template device_context() + .stream(); + + auto dx_runner = NpuOpRunner("SqrtGrad", {*out, *dout}, {*dx}, {}); + dx_runner.Run(stream); + } +}; + +template +class LogNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + + auto* out = ctx.Output("Out"); + + auto place = ctx.GetPlace(); + + out->mutable_data(place); + + auto stream = + ctx.template device_context() + .stream(); + + Tensor one(x->type()); + one.mutable_data(x->dims(), place); + auto one_runner = NpuOpRunner("OnesLike", {*x}, {one}, {}); + one_runner.Run(stream); + + Tensor sub(x->type()); + sub.mutable_data(x->dims(), place); + auto sub_runner = NpuOpRunner("Sub", {*x, one}, {sub}, {}); + sub_runner.Run(stream); + + auto out_runner = NpuOpRunner("Log1p", {sub}, {*out}, {}); + out_runner.Run(stream); + } +}; + +template +class LogGradNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* x = 
ctx.Input("X"); + + auto* dx = ctx.Output(framework::GradVarName("X")); + + auto place = ctx.GetPlace(); + + dx->mutable_data(place); + + auto stream = + ctx.template device_context() + .stream(); + auto runner = NpuOpRunner("DivNoNan", {*dout, *x}, {*dx}, {}); + runner.Run(stream); + } +}; + +template +class TanhNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + + auto* out = ctx.Output("Out"); + + auto place = ctx.GetPlace(); + + out->mutable_data(place); + + auto stream = + ctx.template device_context() + .stream(); + + auto runner = NpuOpRunner("Tanh", {*x}, {*out}, {}); + runner.Run(stream); + } +}; + +template +class TanhGradNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* out = ctx.Input("Out"); + + auto* dx = ctx.Output(framework::GradVarName("X")); + + auto place = ctx.GetPlace(); + + dx->mutable_data(place); + + auto stream = + ctx.template device_context() + .stream(); + + auto dx_runner = NpuOpRunner("TanhGrad", {*out, *dout}, {*dx}, {}); + dx_runner.Run(stream); + } +}; + +template +class SquareNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + + auto* out = ctx.Output("Out"); + + auto place = ctx.GetPlace(); + + out->mutable_data(place); + + auto stream = + ctx.template device_context() + .stream(); + + auto runner = NpuOpRunner("Square", {*x}, {*out}, {}); + runner.Run(stream); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_NPU_KERNEL( + pow, ops::PowNPUKernel, + ops::PowNPUKernel); + +REGISTER_OP_NPU_KERNEL( + pow_grad, ops::PowGradNPUKernel, + ops::PowGradNPUKernel); + +REGISTER_OP_NPU_KERNEL( + relu, ops::ReluNPUKernel, + ops::ReluNPUKernel); + +REGISTER_OP_NPU_KERNEL( + relu_grad, + ops::ReluGradNPUKernel, + ops::ReluGradNPUKernel); + +REGISTER_OP_NPU_KERNEL( + sqrt, ops::SqrtNPUKernel, + ops::SqrtNPUKernel); + +REGISTER_OP_NPU_KERNEL( + sqrt_grad, + ops::SqrtGradNPUKernel, + ops::SqrtGradNPUKernel); + +REGISTER_OP_NPU_KERNEL( + log, ops::LogNPUKernel, + ops::LogNPUKernel); + +REGISTER_OP_NPU_KERNEL( + log_grad, ops::LogGradNPUKernel, + ops::LogGradNPUKernel); + +REGISTER_OP_NPU_KERNEL( + tanh, ops::TanhNPUKernel, + ops::TanhNPUKernel); + +REGISTER_OP_NPU_KERNEL( + tanh_grad, + ops::TanhGradNPUKernel, + ops::TanhGradNPUKernel); + +REGISTER_OP_NPU_KERNEL( + square, ops::SquareNPUKernel, + ops::SquareNPUKernel, + ops::SquareNPUKernel); diff --git a/paddle/fluid/operators/amp/CMakeLists.txt b/paddle/fluid/operators/amp/CMakeLists.txt index b3ff52a7ae119..2ea8bbcbc61df 100644 --- a/paddle/fluid/operators/amp/CMakeLists.txt +++ b/paddle/fluid/operators/amp/CMakeLists.txt @@ -4,3 +4,7 @@ if(WITH_UNITY_BUILD) include(unity_build_rule.cmake) endif() register_operators() + +if(WITH_ASCEND_CL) + cc_test(check_finite_and_unscale_op_npu_test SRCS check_finite_and_unscale_op_npu_test.cc DEPS op_registry check_finite_and_unscale_op scope device_context enforce executor) +endif() diff --git a/paddle/fluid/operators/amp/alloc_float_status_op.cc b/paddle/fluid/operators/amp/alloc_float_status_op.cc new file mode 100644 index 0000000000000..181dd6eabe22d --- /dev/null +++ b/paddle/fluid/operators/amp/alloc_float_status_op.cc @@ -0,0 +1,74 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. 
All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include + +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +class AllocFloatStatusOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasOutput("FloatStatus"), "Output", "FloatStatus", + "alloc_float_status"); + ctx->SetOutputDim("FloatStatus", {8}); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType(framework::proto::VarType::FP32, + ctx.GetPlace()); + } +}; + +class AllocFloatStatusMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddOutput("FloatStatus", + "(Tensor) of shape {8} that holds the float status."); + AddComment(R"DOC( + Produces a float Tensor that holds the float status +)DOC"); + } +}; + +template +class AllocFloatStatusKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + PADDLE_THROW(platform::errors::Unimplemented( + "Operator alloc_float_status is not supported on CPU")); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +using CPU = paddle::platform::CPUDeviceContext; + +REGISTER_OPERATOR( + alloc_float_status, ops::AllocFloatStatusOp, ops::AllocFloatStatusMaker, + paddle::framework::EmptyGradOpMaker, + paddle::framework::EmptyGradOpMaker); + +REGISTER_OP_CPU_KERNEL(alloc_float_status, + ops::AllocFloatStatusKernel); diff --git a/paddle/fluid/operators/amp/alloc_float_status_op_npu.cc b/paddle/fluid/operators/amp/alloc_float_status_op_npu.cc new file mode 100644 index 0000000000000..fe5b08af52a62 --- /dev/null +++ b/paddle/fluid/operators/amp/alloc_float_status_op_npu.cc @@ -0,0 +1,47 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/npu_op_runner.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +class AllocFloatStatusKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* float_status = ctx.Output("FloatStatus"); + float_status->mutable_data(ctx.GetPlace()); + + auto runner = NpuOpRunner("NPUAllocFloatStatus", {}, {*float_status}); + auto stream = + ctx.template device_context() + .stream(); + runner.Run(stream); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_NPU_KERNEL( + alloc_float_status, + ops::AllocFloatStatusKernel); diff --git a/paddle/fluid/operators/amp/check_finite_and_unscale_op.cc b/paddle/fluid/operators/amp/check_finite_and_unscale_op.cc index 9d78936ad5f7f..c7520dbd34f6a 100644 --- a/paddle/fluid/operators/amp/check_finite_and_unscale_op.cc +++ b/paddle/fluid/operators/amp/check_finite_and_unscale_op.cc @@ -60,6 +60,12 @@ class CheckFiniteAndUnscaleOpMaker : public framework::OpProtoAndCheckerMaker { AddInput("Scale", "(Tensor) 1-dim tensor, the scale of check_finite_and_unscale " "operator."); +#ifdef PADDLE_WITH_ASCEND_CL + AddInput("FloatStatus", + "(Tensor) 1-dim tensor of shape [8], allocated by " + "alloc_float_status op") + .AsDispensable(); +#endif AddOutput("Out", "(Tensors) The scaled output tensor of " "check_finite_and_unscale operator.") diff --git a/paddle/fluid/operators/amp/check_finite_and_unscale_op.cu b/paddle/fluid/operators/amp/check_finite_and_unscale_op.cu index 6840e4847c4c6..2c3a9c366e4fd 100644 --- a/paddle/fluid/operators/amp/check_finite_and_unscale_op.cu +++ b/paddle/fluid/operators/amp/check_finite_and_unscale_op.cu @@ -26,18 +26,48 @@ __global__ void InverseAndMemset(const T* s, T* o, bool* found_inf) { } template -__global__ void CheckFiniteAndUnscale(const T* in, const MT* scale, int num, - bool* found_inf, T* out) { - const int idx = threadIdx.x + blockIdx.x * blockDim.x; - - if (idx < num) { - MT val = static_cast(in[idx]) * (*scale); +__global__ void CheckFiniteAndUnscale(const T** xs, const MT* scale, + int64_t size, int64_t* starts, + bool* found_inf, T** outs) { + const int64_t tid = threadIdx.x + blockIdx.x * blockDim.x; + + // copy starts array from global memory to shared memory + extern __shared__ int64_t s_starts[]; + for (int i = threadIdx.x; i <= size; i += blockDim.x) { + s_starts[i] = starts[i]; + } + __syncthreads(); + + const int64_t num = s_starts[size]; + int pre_xs_index = 0; + bool t_found_inf = false; + const MT t_scale = *scale; + for (int64_t idx = tid; idx < num; idx += gridDim.x * blockDim.x) { + // get the xs's index of thread + int xs_index = pre_xs_index; + while (idx < s_starts[xs_index]) xs_index++; + // avoid some tensor's numel is zero + while (idx >= s_starts[xs_index]) xs_index++; + pre_xs_index = xs_index - 1; + + // get in data and out data + const T* in = xs[pre_xs_index]; + T* out = outs[pre_xs_index]; + int64_t in_idx = idx - s_starts[pre_xs_index]; + + // Unscale + MT val = static_cast(in[in_idx]) * t_scale; T narrow_val = static_cast(val); - out[idx] = narrow_val; + out[in_idx] = narrow_val; + + // CheckFinite if (!isfinite(narrow_val)) { - *found_inf = true; + t_found_inf = true; } } + if (t_found_inf) { + *found_inf = true; + } } template @@ -63,20 +93,53 @@ class CheckFiniteAndUnscaleGpuKernel : public framework::OpKernel { 
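// Note on the fused indexing used by the CheckFiniteAndUnscale kernel above and by the
// host code below (illustrative values only, not taken from the patch): with three input
// tensors of numel 3, 0 and 2, starts = {0, 3, 3, 5} and the total element count is
// starts[3] = 5. A flattened thread index idx = 4 satisfies starts[2] <= idx < starts[3],
// so it belongs to the third tensor (index 2) at offset idx - starts[2] = 1; the empty
// tensor is skipped because consecutive entries of starts are equal there.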
InverseAndMemset<<<1, 1, 0, dev_ctx.stream()>>>( scale_data, inverse_scale_v, found_inf_data); - for (size_t i = 0; i < xs.size(); ++i) { - const auto* x = xs[i]; - auto* out = outs[i]; - const T* x_data = x->data(); - T* out_data = out->mutable_data(dev_ctx.GetPlace()); - - int num = x->numel(); - int block = 1024; - int grid = (num + block - 1) / block; - VLOG(3) << "launch kernel"; - CheckFiniteAndUnscale<<>>( - x_data, inverse_scale_v, num, found_inf_data, out_data); - VLOG(3) << "finish kernel"; + size_t xs_size = xs.size(); + // calculate each tensor's start index and copy to device + auto h_starts_tensor = + memory::Alloc(platform::CPUPlace(), (xs_size + 1) * sizeof(int64_t)); + int64_t* h_starts = reinterpret_cast(h_starts_tensor->ptr()); + + auto d_starts_tensor = + memory::Alloc(dev_ctx, (xs_size + 1) * sizeof(int64_t)); + int64_t* d_starts = reinterpret_cast(d_starts_tensor->ptr()); + + h_starts[0] = 0; + for (int i = 1; i <= xs_size; i++) { + // the start index value of each tensor is + // the sum of previous tensor's size + h_starts[i] = h_starts[i - 1] + xs[i - 1]->numel(); + } + int64_t total_num = h_starts[xs_size]; + memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()), + d_starts, platform::CPUPlace(), h_starts, + (xs_size + 1) * sizeof(int64_t), dev_ctx.stream()); + + // copy each tensor's data address to device + auto h_mem = memory::Alloc(platform::CPUPlace(), 2 * xs_size * sizeof(T*)); + const T** h_xs = reinterpret_cast(h_mem->ptr()); + T** h_outs = reinterpret_cast(h_mem->ptr()) + xs_size; + + auto d_mem = memory::Alloc(dev_ctx, 2 * xs_size * sizeof(T*)); + const T** d_xs = reinterpret_cast(d_mem->ptr()); + T** d_outs = reinterpret_cast(d_mem->ptr()) + xs_size; + + for (size_t i = 0; i < xs_size; ++i) { + h_xs[i] = xs[i]->data(); + h_outs[i] = outs[i]->mutable_data(dev_ctx.GetPlace()); } + memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()), d_xs, + platform::CPUPlace(), h_xs, 2 * xs_size * sizeof(T*), + dev_ctx.stream()); + + // Launch Kernel + int block = 1024; + int block_num = block * 20; // each thread deal with 20 number + int grid = (total_num + block_num - 1) / block_num; + VLOG(3) << "launch kernel"; + CheckFiniteAndUnscale<<< + grid, block, (xs_size + 1) * sizeof(int64_t), dev_ctx.stream()>>>( + d_xs, inverse_scale_v, xs_size, d_starts, found_inf_data, d_outs); + VLOG(3) << "finish kernel"; } }; } // namespace operators diff --git a/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu.cc b/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu.cc new file mode 100644 index 0000000000000..8fd45326e4ec6 --- /dev/null +++ b/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu.cc @@ -0,0 +1,130 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include + +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/operators/amp/check_finite_and_unscale_op.h" +#include "paddle/fluid/operators/npu_op_runner.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +// NOTE(zhiqiu): The CheckFiniteAndUnscaleNPUKernel is different from CUDA. +// On NPU, we do not really check the data of input tensors, +// but use NPUGetFloatStatus to check whether nan/inf occurs on the device, +// and clear it after this op. +// This may lead to wrong results if the input tensors are not computed +// on the NPU device but come from somewhere else, for example, feeding. +template +class CheckFiniteAndUnscaleNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const { + const auto xs = ctx.MultiInput("X"); + const auto* scale = ctx.Input("Scale"); + const auto* float_status = ctx.Input("FloatStatus"); + auto outs = ctx.MultiOutput("Out"); + auto* found_inf = ctx.Output("FoundInfinite"); + + found_inf->mutable_data(ctx.GetPlace()); + + bool found_inf_data = false; + + auto stream = + ctx.template device_context() + .stream(); + + // step 1: compute the inverse scale (1.0 / scale) + Tensor const_tensor; + const_tensor.mutable_data({1}, ctx.GetPlace()); + FillNpuTensorWithConstant(&const_tensor, static_cast(1.0)); + + // Inverse(1.0/scale) + Tensor* tmp_inverse_out = const_cast(scale); + Tensor inverse_out(scale->type()); + inverse_out.Resize(scale->dims()); + inverse_out.mutable_data(ctx.GetPlace()); + auto runner_inverse = + NpuOpRunner("Div", {const_tensor, *scale}, {inverse_out}, {}); + runner_inverse.Run(stream); + tmp_inverse_out = &inverse_out; + + // NOTE(zhiqiu): + Tensor tmp; + tmp.mutable_data({8}, ctx.GetPlace()); + + // NOTE(zhiqiu): NPUGetFloatStatus updates data on input in-place. + // tmp is only placeholder.
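    // Sketch of the float-status flow this kernel relies on (names follow the runners
    // used below; the exact device-side semantics are assumed, not verified):
    //   float_status = alloc_float_status()       -> {8} tensor, allocated once
    //   ... fp16/fp32 compute runs on the NPU ...
    //   NPUGetFloatStatus(float_status)           -> overflow flags accumulate in float_status
    //   sum = ReduceSumD(float_status); found_inf = (sum > 1)
    //   NPUClearFloatStatus(float_status)         -> reset before the next step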
+ auto runner_float_status = + NpuOpRunner("NPUGetFloatStatus", {*float_status}, {tmp}, + {{"message", std::string("check_nan_and_inf")}}); + runner_float_status.Run(stream); + + Tensor sum; + sum.mutable_data({1}, ctx.GetPlace()); + auto runner_reduce_sum = + NpuOpRunner("ReduceSumD", {*float_status}, {sum}, + {{"axes", std::vector{0}}, {"keep_dims", true}}); + runner_reduce_sum.Run(stream); + + std::vector sum_vec; + TensorToVector( + sum, ctx.template device_context(), + &sum_vec); + found_inf_data = (sum_vec[0] > 1); + + VLOG(4) << "found_inf_data:" << found_inf_data; + + for (size_t i = 0; i < xs.size(); ++i) { + const auto* x = xs[i]; + auto* out = outs[i]; + out->mutable_data(ctx.GetPlace()); + if (!found_inf_data) { + // unscale: out = x * (1.0 / scale), element-wise Mul + auto runner_matmul = + NpuOpRunner("Mul", {*x, *tmp_inverse_out}, {*out}, {}); + runner_matmul.Run(stream); + } + } + + // copy found_inf_data into the FoundInfinite output tensor + VLOG(4) << "found overflow:" << found_inf_data; + Tensor found_inf_tensor; + found_inf_tensor.Resize({1}); + bool* is_found_inf = + found_inf_tensor.mutable_data(paddle::platform::CPUPlace()); + *is_found_inf = found_inf_data; + + framework::TensorCopy( + found_inf_tensor, ctx.GetPlace(), + ctx.template device_context(), found_inf); + ctx.template device_context().Wait(); + + auto runner_clear_status = + NpuOpRunner("NPUClearFloatStatus", {*float_status}, {tmp}); + runner_clear_status.Run(stream); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; +REGISTER_OP_NPU_KERNEL(check_finite_and_unscale, + ops::CheckFiniteAndUnscaleNPUKernel, + ops::CheckFiniteAndUnscaleNPUKernel); diff --git a/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu_test.cc b/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu_test.cc new file mode 100644 index 0000000000000..a80b83f0cbe51 --- /dev/null +++ b/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu_test.cc @@ -0,0 +1,131 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License.
*/ + +#ifndef _WIN32 +#include +#endif + +#include +#include +#include +#include +#include "gtest/gtest.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/platform/enforce.h" + +namespace f = paddle::framework; +namespace p = paddle::platform; +namespace m = paddle::operators::math; + +using Tensor = paddle::framework::Tensor; + +USE_OP(check_finite_and_unscale); +USE_OP_DEVICE_KERNEL(check_finite_and_unscale, NPU); + +struct InputVars { + std::string name; + f::LoDTensor *tensor; +}; + +template +void Compare(f::Scope *scope, const p::DeviceContext &ctx) { + const f::DDim dims = f::make_ddim({2, 2}); + auto place = ctx.GetPlace(); + + // init input + std::vector input_names = { + {"x", scope->Var("x")->GetMutable()}, + {"x1", scope->Var("x1")->GetMutable()}}; + + auto *scale = scope->Var("scale")->GetMutable(); + + // init output + auto *out = scope->Var("out")->GetMutable(); + auto *out1 = scope->Var("out1")->GetMutable(); + auto *found_inf = scope->Var("found_inf")->GetMutable(); + + // Initialize input data + const int num_inputs = input_names.size(); + size_t numel = static_cast(f::product(dims)); + + for (int i = 0; i < num_inputs; ++i) { + std::vector init_xs; + for (size_t j = 0; j < numel; ++j) { + if (j == 0) { + init_xs.push_back(static_cast(NAN)); + } else { + init_xs.push_back(static_cast(j + 1)); + } + } + f::TensorFromVector(init_xs, ctx, input_names[i].tensor); + input_names[i].tensor->Resize(dims); + } + + f::TensorFromVector(std::vector{static_cast(0.5)}, ctx, scale); + + ctx.Wait(); + + // run + f::AttributeMap attrs; + auto op = f::OpRegistry::CreateOp( + "check_finite_and_unscale", {{"X", {"x", "x1"}}, {"Scale", {"scale"}}}, + {{"Out", {"out", "out1"}}, {"FoundInfinite", {"found_inf"}}}, attrs); + op->Run(*scope, place); + ctx.Wait(); + + // out0 + std::vector out_vec; + f::TensorToVector(*out, ctx, &out_vec); + EXPECT_EQ(out_vec.size(), static_cast(4)); + for (size_t j = 0; j < out_vec.size(); ++j) { + VLOG(3) << "out_vec[" << j << "]:" << out_vec[j]; + } + + ctx.Wait(); + + // out0 + std::vector out1_vec; + f::TensorToVector(*out1, ctx, &out1_vec); + EXPECT_EQ(out1_vec.size(), static_cast(4)); + for (size_t j = 0; j < out1_vec.size(); ++j) { + VLOG(3) << "out1_vec[" << j << "]:" << out1_vec[j]; + } + + ctx.Wait(); + + // out found_inf + Tensor found_inf_tensor; + found_inf_tensor.Resize({1}); + bool *found_inf_data = + found_inf_tensor.mutable_data(paddle::platform::CPUPlace()); + f::TensorCopy(*found_inf, place, &found_inf_tensor); + EXPECT_TRUE(*found_inf_data); + + ctx.Wait(); +} + +TEST(check_finite_and_unscale, NPU_fp32) { + f::Scope scope; + auto *ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0)); + Compare(&scope, *ctx); +} + +TEST(check_finite_and_unscale, NPU_fp16) { + f::Scope scope; + auto *ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0)); + Compare(&scope, *ctx); +} diff --git a/paddle/fluid/operators/amp/update_loss_scaling_op_npu.cc b/paddle/fluid/operators/amp/update_loss_scaling_op_npu.cc new file mode 100644 index 0000000000000..45b28bf61e5d6 --- /dev/null +++ b/paddle/fluid/operators/amp/update_loss_scaling_op_npu.cc @@ -0,0 +1,219 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/amp/update_loss_scaling_op.h" +#include +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/npu_op_runner.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +void Update(const platform::NPUDeviceContext& ctx, + const std::vector found_inf_vec, + const Tensor* pre_loss_scaling_tensor, const Tensor* good_in_tensor, + const Tensor* bad_in_tensor, const int incr_every_n_steps, + const int decr_every_n_nan_or_inf, const float incr_ratio, + const float decr_ratio, Tensor* updated_loss_scaling_tensor, + Tensor* good_out_tensor, Tensor* bad_out_tensor) { + auto place = ctx.GetPlace(); + auto stream = ctx.stream(); + if (found_inf_vec[0]) { + // good_out_data = 0 + auto g = good_out_tensor->mutable_data(place); + platform::NPUMemsetAsync(static_cast(g), 0, + good_out_tensor->numel() * sizeof(int), stream); + // bad_out_data = bad_in_data + 1 + Tensor factor_tensor(bad_out_tensor->type()); + factor_tensor.mutable_data({1}, place); + FillNpuTensorWithConstant(&factor_tensor, static_cast(1)); + auto runner_p2 = NpuOpRunner("Add", {*bad_in_tensor, factor_tensor}, + {*bad_out_tensor}, {}); + runner_p2.Run(stream); + + std::vector bad_out_data; + TensorToVector(*bad_out_tensor, ctx, &bad_out_data); + if (bad_out_data[0] == decr_every_n_nan_or_inf) { + auto runner_p3 = NpuOpRunner("Power", {*pre_loss_scaling_tensor}, + {*updated_loss_scaling_tensor}, + {{"power", static_cast(1)}, + {"scale", decr_ratio}, + {"shift", static_cast(0)}}); + + runner_p3.Run(stream); + + std::vector new_loss_scaling; + TensorToVector(*updated_loss_scaling_tensor, ctx, &new_loss_scaling); + if (new_loss_scaling[0] < static_cast(1)) { + // updated_loss_scaling_data = 1 + auto runner_p4 = NpuOpRunner("Power", {*pre_loss_scaling_tensor}, + {*updated_loss_scaling_tensor}, + {{"power", static_cast(1)}, + {"scale", static_cast(0)}, + {"shift", static_cast(1)}}); + + runner_p4.Run(stream); + } + + // bad_out_data = 0 + auto b = bad_out_tensor->mutable_data(place); + platform::NPUMemsetAsync(static_cast(b), 0, + bad_out_tensor->numel() * sizeof(int), stream); + } + } else { + // bad_out_data = 0 + auto b = bad_out_tensor->mutable_data(place); + platform::NPUMemsetAsync(static_cast(b), 0, + bad_out_tensor->numel() * sizeof(int), stream); + + // good_out_data = good_in_data + 1 + Tensor factor_tensor(good_out_tensor->type()); + factor_tensor.mutable_data({1}, place); + FillNpuTensorWithConstant(&factor_tensor, static_cast(1)); + auto runner_p2 = NpuOpRunner("Add", {*good_in_tensor, factor_tensor}, + {*good_out_tensor}, {}); + runner_p2.Run(stream); + + std::vector good_out_data; + TensorToVector(*good_out_tensor, ctx, &good_out_data); + + if (good_out_data[0] == incr_every_n_steps) { + auto runner_p3 = NpuOpRunner("Power", {*pre_loss_scaling_tensor}, + {*updated_loss_scaling_tensor}, + {{"power", static_cast(1)}, + {"scale", incr_ratio}, + {"shift", static_cast(0)}}); + runner_p3.Run(stream); + + std::vector new_loss_scaling; + TensorToVector(*updated_loss_scaling_tensor, ctx, &new_loss_scaling); + if 
(!std::isfinite(new_loss_scaling[0])) { + // updated_loss_scaling_data = pre_loss_scaling_data + auto runner_p4 = NpuOpRunner("Power", {*pre_loss_scaling_tensor}, + {*updated_loss_scaling_tensor}, + {{"power", static_cast(1)}, + {"scale", static_cast(1)}, + {"shift", static_cast(0)}}); + + runner_p4.Run(stream); + } + // good_out_data = 0 + auto g = good_out_tensor->mutable_data(place); + platform::NPUMemsetAsync(static_cast(g), 0, + good_out_tensor->numel() * sizeof(int), stream); + } + } +} + +template +class UpdateLossScalingFunctor { + public: + void operator()(const platform::NPUDeviceContext& dev_ctx, + const std::vector found_inf_vec, + const Tensor* pre_loss_scaling_tensor, + const Tensor* good_in_tensor, const Tensor* bad_in_tensor, + const int incr_every_n_steps, + const int decr_every_n_nan_or_inf, const float incr_ratio, + const float decr_ratio, Tensor* updated_loss_scaling_tensor, + Tensor* good_out_tensor, Tensor* bad_out_tensor) const { + Update(dev_ctx, found_inf_vec, pre_loss_scaling_tensor, good_in_tensor, + bad_in_tensor, incr_every_n_steps, decr_every_n_nan_or_inf, + incr_ratio, decr_ratio, updated_loss_scaling_tensor, + good_out_tensor, bad_out_tensor); + } +}; + +template +class LazyZerosNPU { + public: + void operator()(const platform::NPUDeviceContext& dev_ctx, + const std::vector found_inf_vec, + const std::vector& xs, + const std::vector& outs) const { + for (size_t i = 0; i < xs.size(); ++i) { + auto* out = outs[i]; + if (found_inf_vec[0]) { + VLOG(4) << "-- UpdateLossScaling: Find infinite grads. --"; + + auto place = dev_ctx.GetPlace(); + auto stream = dev_ctx.stream(); + auto g = out->mutable_data(place); + platform::NPUMemsetAsync(static_cast(g), 0, + out->numel() * sizeof(T), stream); + } + } + } +}; + +template +class UpdateLossScalingNPUKernel : public framework::OpKernel { + using MPDType = typename details::MPTypeTrait::Type; + + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto& dev_ctx = ctx.template device_context(); + + const auto xs = ctx.MultiInput("X"); + auto outs = ctx.MultiOutput("Out"); + const auto* found_inf = ctx.Input("FoundInfinite"); + PADDLE_ENFORCE_EQ(found_inf->numel(), 1, + platform::errors::InvalidArgument( + "FoundInfinite must has only one element.")); + + std::vector found_inf_vec; + TensorToVector(*found_inf, ctx.device_context(), &found_inf_vec); + + LazyZerosNPU{}(dev_ctx, found_inf_vec, xs, outs); + const bool stop_update = ctx.Attr("stop_update"); + if (stop_update) { + return; + } + + const auto* pre_loss_scaling = ctx.Input("PrevLossScaling"); + const auto* good_in = ctx.Input("InGoodSteps"); + const auto* bad_in = ctx.Input("InBadSteps"); + auto* updated_loss_scaling = ctx.Output("LossScaling"); + auto* good_out = ctx.Output("OutGoodSteps"); + auto* bad_out = ctx.Output("OutBadSteps"); + + updated_loss_scaling->mutable_data(dev_ctx.GetPlace()); + good_out->mutable_data(dev_ctx.GetPlace()); + bad_out->mutable_data(dev_ctx.GetPlace()); + + const int incr_every_n_steps = ctx.Attr("incr_every_n_steps"); + const int decr_every_n_nan_or_inf = + ctx.Attr("decr_every_n_nan_or_inf"); + const float incr_ratio = ctx.Attr("incr_ratio"); + const float decr_ratio = ctx.Attr("decr_ratio"); + UpdateLossScalingFunctor{}( + dev_ctx, found_inf_vec, pre_loss_scaling, good_in, bad_in, + incr_every_n_steps, decr_every_n_nan_or_inf, incr_ratio, decr_ratio, + updated_loss_scaling, good_out, bad_out); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + 
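// Summary of the dynamic loss-scaling rule implemented by Update() above (pseudo code
// over the kernel attributes; behavior read off the runners used, so treat it as a sketch):
//   if found_inf:
//     good = 0; bad = bad_in + 1
//     if bad == decr_every_n_nan_or_inf:
//       scale = prev_scale * decr_ratio   // reset to 1 if the result drops below 1
//       bad = 0
//   else:
//     bad = 0; good = good_in + 1
//     if good == incr_every_n_steps:
//       scale = prev_scale * incr_ratio   // keep prev_scale if the result is not finite
//       good = 0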
+REGISTER_OP_NPU_KERNEL( + update_loss_scaling, + ops::UpdateLossScalingNPUKernel, + ops::UpdateLossScalingNPUKernel); diff --git a/paddle/fluid/operators/assign_op_npu.cc b/paddle/fluid/operators/assign_op_npu.cc new file mode 100644 index 0000000000000..93689d5e495f3 --- /dev/null +++ b/paddle/fluid/operators/assign_op_npu.cc @@ -0,0 +1,63 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include + +#include "paddle/fluid/operators/assign_op.h" +#include "paddle/fluid/operators/npu_op_runner.h" +#include "paddle/fluid/platform/float16.h" + +namespace paddle { +namespace framework { +class OpDesc; +class Variable; +} // namespace framework +namespace imperative { +class OpBase; +} // namespace imperative +namespace platform { +struct CPUPlace; +struct CUDAPlace; +struct float16; +} // namespace platform +} // namespace paddle + +namespace paddle { +namespace operators { +template +class AssignNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* out = ctx.Output("Out"); + out->mutable_data(ctx.GetPlace()); + + auto runner = NpuOpRunner("Assign", {*out, *x}, {*out}, {}); + auto stream = + ctx.template device_context() + .stream(); + runner.Run(stream); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_NPU_KERNEL( + assign, ops::AssignNPUKernel, + ops::AssignNPUKernel, + ops::AssignNPUKernel) diff --git a/paddle/fluid/operators/assign_op_npu_test.cc b/paddle/fluid/operators/assign_op_npu_test.cc new file mode 100644 index 0000000000000..792d01a5efe43 --- /dev/null +++ b/paddle/fluid/operators/assign_op_npu_test.cc @@ -0,0 +1,80 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifndef _WIN32 +#include +#endif + +#include +#include // NOLINT +#include + +#include "gtest/gtest.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/operators/dropout_op.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/string/printf.h" + +namespace f = paddle::framework; +namespace p = paddle::platform; +namespace m = paddle::operators::math; + +USE_OP(assign); +USE_OP_DEVICE_KERNEL(assign, NPU); + +template +void Compare(f::Scope* scope, const p::DeviceContext& ctx, + std::string op_type) { + // init + auto x = scope->Var("X"); + auto tensor_x = x->GetMutable(); + + std::vector init; + init.push_back(static_cast(1.0)); + init.push_back(static_cast(2.0)); + init.push_back(static_cast(3.0)); + init.push_back(static_cast(4.0)); + + TensorFromVector(init, ctx, tensor_x); + tensor_x->Resize({4}); + + ctx.Wait(); + + auto place = ctx.GetPlace(); + auto out = scope->Var("Out"); + auto tensor_out = out->GetMutable(); + + auto op = + f::OpRegistry::CreateOp(op_type, {{"X", {"X"}}}, {{"Out", {"Out"}}}, {}); + + op->Run(*scope, place); + + std::vector out_vec; + TensorToVector(*tensor_out, ctx, &out_vec); + + ctx.Wait(); + + EXPECT_EQ((uint32_t)out_vec.size(), (uint32_t)4); + EXPECT_EQ(out_vec[0], static_cast(1.0)); + EXPECT_EQ(out_vec[1], static_cast(2.0)); + EXPECT_EQ(out_vec[2], static_cast(3.0)); + EXPECT_EQ(out_vec[3], static_cast(4.0)); +} + +TEST(assign, NPU_fp32) { + f::Scope scope; + auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0)); + Compare(&scope, *ctx, "assign"); +} diff --git a/paddle/fluid/operators/batch_norm_op.cu b/paddle/fluid/operators/batch_norm_op.cu index 444c24b826b1b..41dc87ac1ba47 100644 --- a/paddle/fluid/operators/batch_norm_op.cu +++ b/paddle/fluid/operators/batch_norm_op.cu @@ -41,6 +41,83 @@ using CudnnDataType = platform::CudnnDataType; template using BatchNormParamType = typename CudnnDataType::BatchNormParamType; +template +static __global__ void BNForwardInference( + const T *x, const BatchNormParamType *mean, + const BatchNormParamType *variance, const BatchNormParamType *scale, + const BatchNormParamType *bias, const int C, const int N, const int HxW, + const double epsilon, T *y) { + int gid = blockIdx.x * blockDim.x + threadIdx.x; + int stride = blockDim.x * gridDim.x; + int num = N * C * HxW; + for (int i = gid; i < num; i += stride) { + const int c = layout == framework::DataLayout::kNCHW ? 
i / HxW % C : i % C; + BatchNormParamType x_sub_mean = + static_cast>(x[i]) - mean[c]; + BatchNormParamType inv_var = 1 / sqrt(variance[c] + epsilon); + y[i] = static_cast(scale[c] * x_sub_mean * inv_var + bias[c]); + } +} + +template +static __global__ LAUNCH_BOUNDS(BlockDim) void BNForwardTraining( + const T *x, const BatchNormParamType *scale, + const BatchNormParamType *bias, const int C, const int N, const int HxW, + const double epsilon, double exponentialAverageFactor, T *y, + BatchNormParamType *mean, BatchNormParamType *variance, + BatchNormParamType *save_mean, + BatchNormParamType *save_inv_variance) { + int outer_size = C; + int inner_size = N * HxW; + typedef cub::BlockReduce, BlockDim> BlockReduce; + __shared__ typename BlockReduce::TempStorage mean_storage; + __shared__ typename BlockReduce::TempStorage variance_storeage; + __shared__ BatchNormParamType mean_val; + __shared__ BatchNormParamType variance_val; + __shared__ BatchNormParamType inv_var_val; + + for (int i = blockIdx.x; i < outer_size; i += gridDim.x) { + BatchNormParamType x_sum = static_cast>(0); + BatchNormParamType x_square_sum = static_cast>(0); + + for (int j = threadIdx.x; j < inner_size; j += blockDim.x) { + const int index = layout == framework::DataLayout::kNCHW + ? (j / HxW * C + i) * HxW + j % HxW + : j * outer_size + i; + BatchNormParamType x_i = static_cast>(x[index]); + x_sum += x_i; + x_square_sum += x_i * x_i; + } + x_sum = BlockReduce(mean_storage).Reduce(x_sum, cub::Sum()); + x_square_sum = + BlockReduce(variance_storeage).Reduce(x_square_sum, cub::Sum()); + if (threadIdx.x == 0) { + mean_val = x_sum / inner_size; + variance_val = x_square_sum / inner_size - mean_val * mean_val; + inv_var_val = 1 / sqrt(variance_val + epsilon); + + if (save_mean && save_inv_variance) { + save_mean[i] = mean_val; + save_inv_variance[i] = inv_var_val; + } + mean[i] = (1 - exponentialAverageFactor) * mean_val + + exponentialAverageFactor * mean[i]; + variance[i] = (1 - exponentialAverageFactor) * variance_val + + exponentialAverageFactor * variance[i]; + } + __syncthreads(); + + for (int j = threadIdx.x; j < inner_size; j += blockDim.x) { + const int index = layout == framework::DataLayout::kNCHW + ? (j / HxW * C + i) * HxW + j % HxW + : j * outer_size + i; + BatchNormParamType x_sub_mean = + static_cast>(x[index]) - mean_val; + y[index] = scale[i] * x_sub_mean * inv_var_val + bias[i]; + } + } +} + template class BatchNormKernel : public framework::OpKernel { @@ -80,8 +157,12 @@ class BatchNormKernel auto dtype = platform::CudnnDataType::type; #ifdef PADDLE_WITH_HIP - // HIP do not support compute format of NHWC - auto compute_format = DataLayout::kNCHW; + auto compute_format = data_layout == DataLayout::kNHWC ? 
DataLayout::kNHWC + : DataLayout::kNCHW; + +// TODO(wangran16): wait for MIOpen to improve the performance of BN +// HIP do not support compute format of NHWC +// auto compute_format = DataLayout::kNCHW; #else const bool fast_nhwc_batch_norm = test_mode || @@ -111,14 +192,15 @@ class BatchNormKernel // ------------------- cudnn descriptors --------------------- #ifdef PADDLE_WITH_HIP - miopenTensorDescriptor_t data_desc_; - miopenTensorDescriptor_t bn_param_desc_; - miopenBatchNormMode_t mode_; - - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::miopenCreateTensorDescriptor(&data_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::miopenCreateTensorDescriptor(&bn_param_desc_)); +// TODO(wangran16): wait for MIOpen to improve the performance of BN +// miopenTensorDescriptor_t data_desc_; +// miopenTensorDescriptor_t bn_param_desc_; +// miopenBatchNormMode_t mode_; + +// PADDLE_ENFORCE_CUDA_SUCCESS( +// platform::dynload::miopenCreateTensorDescriptor(&data_desc_)); +// PADDLE_ENFORCE_CUDA_SUCCESS( +// platform::dynload::miopenCreateTensorDescriptor(&bn_param_desc_)); #else cudnnTensorDescriptor_t data_desc_; cudnnTensorDescriptor_t bn_param_desc_; @@ -138,7 +220,8 @@ class BatchNormKernel epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON); #ifdef PADDLE_WITH_HIP - mode_ = miopenBNSpatial; +// TODO(wangran16): wait for MIOpen to improve the performance of BN +// mode_ = miopenBNSpatial; #elif CUDNN_VERSION_MIN(7, 0, 1) if (FLAGS_cudnn_batchnorm_spatial_persistent) { mode_ = CUDNN_BATCHNORM_SPATIAL_PERSISTENT; @@ -161,14 +244,15 @@ class BatchNormKernel } #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenSetTensorDescriptor( - data_desc_, CudnnDataType::type, - x_dims.size() > 3 ? x_dims.size() : 4, const_cast(dims.data()), - const_cast(strides.data()))); - // Note: PERSISTENT not implemented for inference - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::miopenDeriveBNTensorDescriptor( - bn_param_desc_, data_desc_, test_mode ? miopenBNSpatial : mode_)); +// TODO(wangran16): wait for MIOpen to improve the performance of BN +// PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenSetTensorDescriptor( +// data_desc_, CudnnDataType::type, +// x_dims.size() > 3 ? x_dims.size() : 4, const_cast(dims.data()), +// const_cast(strides.data()))); +// Note: PERSISTENT not implemented for inference +// PADDLE_ENFORCE_CUDA_SUCCESS( +// platform::dynload::miopenDeriveBNTensorDescriptor( +// bn_param_desc_, data_desc_, test_mode ? 
miopenBNSpatial : mode_)); #else PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( data_desc_, CudnnDataType::type, @@ -226,28 +310,53 @@ class BatchNormKernel C, est_var->dims()[0], est_var->dims())); #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::miopenBatchNormalizationForwardInference( - handle, miopenBNSpatial, - const_cast( - static_cast(CudnnDataType::kOne())), - const_cast( - static_cast(CudnnDataType::kZero())), - data_desc_, - static_cast(transformed_x.template data()), - data_desc_, - static_cast( - transformed_y.template mutable_data(ctx.GetPlace())), - bn_param_desc_, - const_cast(static_cast( - scale->template data>())), - const_cast(static_cast( - bias->template data>())), - const_cast(static_cast( - est_mean->template data>())), - const_cast(static_cast( - est_var->template data>())), - epsilon)); + const int block_size = 256; + const int grid_size = (N * C * H * W * D + block_size - 1) / block_size; + if (compute_format == DataLayout::kNCHW) { + BNForwardInference< + T, + DataLayout::kNCHW><<>>( + transformed_x.template data(), + est_mean->template data>(), + est_var->template data>(), + scale->template data>(), + bias->template data>(), C, N, H * W * D, + epsilon, transformed_y.template data()); + } else { + BNForwardInference< + T, + DataLayout::kNHWC><<>>( + transformed_x.template data(), + est_mean->template data>(), + est_var->template data>(), + scale->template data>(), + bias->template data>(), C, N, H * W * D, + epsilon, transformed_y.template data()); + } + +// TODO(wangran16): wait for MIOpen to improve the performance of BN +// PADDLE_ENFORCE_CUDA_SUCCESS( +// platform::dynload::miopenBatchNormalizationForwardInference( +// handle, miopenBNSpatial, +// const_cast( +// static_cast(CudnnDataType::kOne())), +// const_cast( +// static_cast(CudnnDataType::kZero())), +// data_desc_, +// static_cast(transformed_x.template data()), +// data_desc_, +// static_cast( +// transformed_y.template mutable_data(ctx.GetPlace())), +// bn_param_desc_, +// const_cast(static_cast( +// scale->template data>())), +// const_cast(static_cast( +// bias->template data>())), +// const_cast(static_cast( +// est_mean->template data>())), +// const_cast(static_cast( +// est_var->template data>())), +// epsilon)); #else PADDLE_ENFORCE_CUDA_SUCCESS( platform::dynload::cudnnBatchNormalizationForwardInference( @@ -365,34 +474,66 @@ class BatchNormKernel #endif // CUDNN_VERSION_MIN(7, 4, 1) if (!called) { #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::miopenBatchNormalizationForwardTraining( - handle, mode_, const_cast(static_cast( - CudnnDataType::kOne())), - const_cast( - static_cast(CudnnDataType::kZero())), - data_desc_, - static_cast(transformed_x.template data()), - data_desc_, - static_cast( - transformed_y.template mutable_data(ctx.GetPlace())), - bn_param_desc_, - const_cast(static_cast( - scale->template data>())), - const_cast(static_cast( - bias->template data>())), - this_factor, - static_cast( - mean_out->template mutable_data>( - ctx.GetPlace())), - static_cast(variance_out->template mutable_data< - BatchNormParamType>(ctx.GetPlace())), - epsilon, - static_cast( - saved_mean->template mutable_data>( - ctx.GetPlace())), - static_cast(saved_variance->template mutable_data< - BatchNormParamType>(ctx.GetPlace())))); + const int num = transformed_x.numel(); + const int block = 256; + const int max_threads = dev_ctx.GetMaxPhysicalThreadCount(); + const int max_blocks = std::max(max_threads / block, 1); + 
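          // Launch-config note (inferred from BNForwardTraining above): each block
          // reduces one channel's N * H * W * D elements with cub::BlockReduce, so
          // the grid only needs up to C blocks.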
const int grid = std::min(C, max_blocks); + if (compute_format == DataLayout::kNCHW) { + BNForwardTraining< + T, block, + DataLayout::kNCHW><<>>( + transformed_x.template data(), + scale->template data>(), + bias->template data>(), C, N, H * W * D, + epsilon, this_factor, transformed_y.template data(), + mean_out->template data>(), + variance_out->template data>(), + saved_mean->template data>(), + saved_variance->template data>()); + } else { + BNForwardTraining< + T, block, + DataLayout::kNHWC><<>>( + transformed_x.template data(), + scale->template data>(), + bias->template data>(), C, N, H * W * D, + epsilon, this_factor, transformed_y.template data(), + mean_out->template data>(), + variance_out->template data>(), + saved_mean->template data>(), + saved_variance->template data>()); + } + +// TODO(wangran16): wait for MIOpen to improve the performance of BN +// PADDLE_ENFORCE_CUDA_SUCCESS( +// platform::dynload::miopenBatchNormalizationForwardTraining( +// handle, mode_, const_cast(static_cast( +// CudnnDataType::kOne())), +// const_cast( +// static_cast(CudnnDataType::kZero())), +// data_desc_, +// static_cast(transformed_x.template data()), +// data_desc_, +// static_cast( +// transformed_y.template mutable_data(ctx.GetPlace())), +// bn_param_desc_, +// const_cast(static_cast( +// scale->template data>())), +// const_cast(static_cast( +// bias->template data>())), +// this_factor, +// static_cast( +// mean_out->template mutable_data>( +// ctx.GetPlace())), +// static_cast(variance_out->template mutable_data< +// BatchNormParamType>(ctx.GetPlace())), +// epsilon, +// static_cast( +// saved_mean->template mutable_data>( +// ctx.GetPlace())), +// static_cast(saved_variance->template mutable_data< +// BatchNormParamType>(ctx.GetPlace())))); #else PADDLE_ENFORCE_CUDA_SUCCESS( platform::dynload::cudnnBatchNormalizationForwardTraining( @@ -423,11 +564,12 @@ class BatchNormKernel ctx, &transformed_y, y); } #ifdef PADDLE_WITH_HIP - // clean when exit. - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::miopenDestroyTensorDescriptor(data_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::miopenDestroyTensorDescriptor(bn_param_desc_)); +// TODO(wangran16): wait for MIOpen to improve the performance of BN +// clean when exit. +// PADDLE_ENFORCE_CUDA_SUCCESS( +// platform::dynload::miopenDestroyTensorDescriptor(data_desc_)); +// PADDLE_ENFORCE_CUDA_SUCCESS( +// platform::dynload::miopenDestroyTensorDescriptor(bn_param_desc_)); #else // clean when exit. 
PADDLE_ENFORCE_CUDA_SUCCESS( @@ -439,7 +581,7 @@ class BatchNormKernel }; template -static __global__ void KeBNBackwardScaleBias( +static __global__ LAUNCH_BOUNDS(BlockDim) void KeBNBackwardScaleBias( const T *dy, const T *x, const BatchNormParamType *mean, const BatchNormParamType *variance, const double epsilon, const int N, const int C, const int HxW, BatchNormParamType *dscale, @@ -526,13 +668,97 @@ class InplaceHelper { }; template -static __global__ void BNBackwardData(const T *dy, - const BatchNormParamType *scale, - const BatchNormParamType *mean, - const T *x, - const BatchNormParamType *variance, - const int C, const int N, const int HxW, - T *dx) { +static __global__ LAUNCH_BOUNDS(BlockDim) void BNBackward( + const T *dy, const T *x, const BatchNormParamType *scale, + const BatchNormParamType *saved_mean, + const BatchNormParamType *saved_inv_variance, const int C, const int N, + const int HxW, const double epsilon, T *dx, BatchNormParamType *dscale, + BatchNormParamType *dbias) { + const int outer_size = C; + const int inner_size = N * HxW; + typedef cub::BlockReduce, BlockDim> BlockReduce; + __shared__ typename BlockReduce::TempStorage ds_storage; + __shared__ typename BlockReduce::TempStorage db_storage; + __shared__ typename BlockReduce::TempStorage mean_storage; + __shared__ typename BlockReduce::TempStorage variance_storeage; + __shared__ BatchNormParamType inv_var_val; + __shared__ BatchNormParamType mean_val; + __shared__ BatchNormParamType dscale_val; + __shared__ BatchNormParamType dbias_val; + + for (int i = blockIdx.x; i < outer_size; i += gridDim.x) { + BatchNormParamType ds_sum = static_cast>(0); + BatchNormParamType db_sum = static_cast>(0); + + if (saved_mean && saved_inv_variance) { + if (threadIdx.x == 0) { + inv_var_val = saved_inv_variance[i]; + mean_val = saved_mean[i]; + } + } else { + BatchNormParamType x_sum = static_cast>(0); + BatchNormParamType x_square_sum = + static_cast>(0); + + for (int j = threadIdx.x; j < inner_size; j += blockDim.x) { + const int index = layout == framework::DataLayout::kNCHW + ? (j / HxW * C + i) * HxW + j % HxW + : j * outer_size + i; + BatchNormParamType x_i = + static_cast>(x[index]); + x_sum += x_i; + x_square_sum += x_i * x_i; + } + x_sum = BlockReduce(mean_storage).Reduce(x_sum, cub::Sum()); + x_square_sum = + BlockReduce(variance_storeage).Reduce(x_square_sum, cub::Sum()); + if (threadIdx.x == 0) { + mean_val = x_sum / inner_size; + inv_var_val = + 1 / sqrt(x_square_sum / inner_size - mean_val * mean_val + epsilon); + } + } + __syncthreads(); + + for (int j = threadIdx.x; j < inner_size; j += blockDim.x) { + const int index = layout == framework::DataLayout::kNCHW + ? (j / HxW * C + i) * HxW + j % HxW + : j * outer_size + i; + BatchNormParamType dy_i = + static_cast>(dy[index]); + ds_sum += + dy_i * (static_cast>(x[index]) - mean_val); + db_sum += dy_i; + } + ds_sum = BlockReduce(ds_storage).Reduce(ds_sum, cub::Sum()); + db_sum = BlockReduce(db_storage).Reduce(db_sum, cub::Sum()); + if (threadIdx.x == 0) { + dscale_val = ds_sum * inv_var_val; + dbias_val = db_sum; + dscale[i] = dscale_val; + dbias[i] = dbias_val; + } + __syncthreads(); + + for (int j = threadIdx.x; j < inner_size; j += blockDim.x) { + const int index = layout == framework::DataLayout::kNCHW + ? 
(j / HxW * C + i) * HxW + j % HxW + : j * outer_size + i; + dx[index] = scale[i] * inv_var_val * + (static_cast>(dy[index]) - + dbias_val / static_cast>(inner_size) - + (static_cast>(x[index]) - mean_val) * + inv_var_val * dscale_val / inner_size); + } + } +} + +template +static __global__ LAUNCH_BOUNDS(BlockDim) void BNBackwardData( + const T *dy, const BatchNormParamType *scale, + const BatchNormParamType *mean, const T *x, + const BatchNormParamType *variance, const int C, const int N, + const int HxW, T *dx) { const int outer_size = C; const int inner_size = N * HxW; typedef cub::BlockReduce, BlockDim> BlockReduce; @@ -567,7 +793,6 @@ static __global__ void BNBackwardData(const T *dy, dy_x_sub_mean_sum_val = dy_x_sub_mean_sum; } __syncthreads(); - for (int j = threadIdx.x; j < inner_size; j += blockDim.x) { const int index = layout == framework::DataLayout::kNCHW ? (j / HxW * C + i) * HxW + j % HxW @@ -668,8 +893,12 @@ class BatchNormGradKernel auto dtype = platform::CudnnDataType::type; const auto *reserve_space = ctx.Input("ReserveSpace"); #ifdef PADDLE_WITH_HIP - // HIP do not support compute format of NHWC - auto compute_format = DataLayout::kNCHW; + auto compute_format = data_layout == DataLayout::kNHWC ? DataLayout::kNHWC + : DataLayout::kNCHW; + +// TODO(wangran16): wait for MIOpen to improve the performance of BN +// HIP do not support compute format of NHWC +// auto compute_format = DataLayout::kNCHW; #else const bool fast_nhwc_batch_norm = dtype == CUDNN_DATA_HALF && FLAGS_cudnn_batchnorm_spatial_persistent && @@ -714,7 +943,11 @@ class BatchNormGradKernel auto &dev_ctx = ctx.template device_context(); const int num = transformed_x.numel(); +#ifdef HIPCC + const int block = 256; +#else const int block = 512; +#endif int max_threads = dev_ctx.GetMaxPhysicalThreadCount(); const int max_blocks = std::max(max_threads / block, 1); int grid1 = (num + block - 1) / block; @@ -734,14 +967,15 @@ class BatchNormGradKernel // ------------------- cudnn descriptors --------------------- #ifdef PADDLE_WITH_HIP - miopenTensorDescriptor_t data_desc_; - miopenTensorDescriptor_t bn_param_desc_; - miopenBatchNormMode_t mode_; - - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::miopenCreateTensorDescriptor(&data_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::miopenCreateTensorDescriptor(&bn_param_desc_)); +// TODO(wangran16): wait for MIOpen to improve the performance of BN +// miopenTensorDescriptor_t data_desc_; +// miopenTensorDescriptor_t bn_param_desc_; +// miopenBatchNormMode_t mode_; + +// PADDLE_ENFORCE_CUDA_SUCCESS( +// platform::dynload::miopenCreateTensorDescriptor(&data_desc_)); +// PADDLE_ENFORCE_CUDA_SUCCESS( +// platform::dynload::miopenCreateTensorDescriptor(&bn_param_desc_)); #else cudnnTensorDescriptor_t data_desc_; cudnnTensorDescriptor_t bn_param_desc_; @@ -759,7 +993,8 @@ class BatchNormGradKernel } epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON); #ifdef PADDLE_WITH_HIP - mode_ = miopenBNSpatial; +// TODO(wangran16): wait for MIOpen to improve the performance of BN +// mode_ = miopenBNSpatial; #elif CUDNN_VERSION_MIN(7, 0, 1) if (FLAGS_cudnn_batchnorm_spatial_persistent) { mode_ = CUDNN_BATCHNORM_SPATIAL_PERSISTENT; @@ -771,13 +1006,14 @@ class BatchNormGradKernel #endif // CUDNN_VERSION_MIN(7, 0, 1) #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenSetTensorDescriptor( - data_desc_, CudnnDataType::type, - x_dims.size() > 3 ? 
x_dims.size() : 4, const_cast(dims.data()), - const_cast(strides.data()))); - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::miopenDeriveBNTensorDescriptor(bn_param_desc_, - data_desc_, mode_)); +// TODO(wangran16): wait for MIOpen to improve the performance of BN +// PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenSetTensorDescriptor( +// data_desc_, CudnnDataType::type, +// x_dims.size() > 3 ? x_dims.size() : 4, const_cast(dims.data()), +// const_cast(strides.data()))); +// PADDLE_ENFORCE_CUDA_SUCCESS( +// platform::dynload::miopenDeriveBNTensorDescriptor(bn_param_desc_, +// data_desc_, mode_)); #else PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( data_desc_, CudnnDataType::type, @@ -871,20 +1107,49 @@ class BatchNormGradKernel #endif // CUDNN_VERSION_MIN(7, 4, 1) if (!called) { #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::miopenBatchNormalizationBackward( - dev_ctx.cudnn_handle(), mode_, CudnnDataType::kOne(), - CudnnDataType::kZero(), CudnnDataType::kOne(), - CudnnDataType::kZero(), data_desc_, - transformed_x.template data(), data_desc_, - transformed_d_y.template data(), data_desc_, - transformed_d_x.template mutable_data(ctx.GetPlace()), - bn_param_desc_, scale->template data>(), - d_scale->template mutable_data>( - ctx.GetPlace()), - d_bias->template mutable_data>( - ctx.GetPlace()), - epsilon, saved_mean_data, saved_var_data)); + if (compute_format == DataLayout::kNCHW) { + BNBackward< + T, block, + DataLayout::kNCHW><<>>( + transformed_d_y.template data(), + transformed_x.template data(), + scale->template data>(), saved_mean_data, + saved_var_data, C, N, H * W * D, epsilon, + transformed_d_x.template data(), + d_scale->template mutable_data>( + ctx.GetPlace()), + d_bias->template mutable_data>( + ctx.GetPlace())); + } else { + BNBackward< + T, block, + DataLayout::kNHWC><<>>( + transformed_d_y.template data(), + transformed_x.template data(), + scale->template data>(), saved_mean_data, + saved_var_data, C, N, H * W * D, epsilon, + transformed_d_x.template data(), + d_scale->template mutable_data>( + ctx.GetPlace()), + d_bias->template mutable_data>( + ctx.GetPlace())); + } + +// TODO(wangran16): wait for MIOpen to improve the performance of BN +// PADDLE_ENFORCE_CUDA_SUCCESS( +// platform::dynload::miopenBatchNormalizationBackward( +// dev_ctx.cudnn_handle(), mode_, CudnnDataType::kOne(), +// CudnnDataType::kZero(), CudnnDataType::kOne(), +// CudnnDataType::kZero(), data_desc_, +// transformed_x.template data(), data_desc_, +// transformed_d_y.template data(), data_desc_, +// transformed_d_x.template mutable_data(ctx.GetPlace()), +// bn_param_desc_, scale->template data>(), +// d_scale->template mutable_data>( +// ctx.GetPlace()), +// d_bias->template mutable_data>( +// ctx.GetPlace()), +// epsilon, saved_mean_data, saved_var_data)); #else PADDLE_ENFORCE_CUDA_SUCCESS( platform::dynload::cudnnBatchNormalizationBackward( @@ -931,11 +1196,12 @@ class BatchNormGradKernel } #ifdef PADDLE_WITH_HIP - // clean when exit. - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::miopenDestroyTensorDescriptor(data_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::miopenDestroyTensorDescriptor(bn_param_desc_)); +// TODO(wangran16): wait for MIOpen to improve the performance of BN +// clean when exit. 
+// PADDLE_ENFORCE_CUDA_SUCCESS( +// platform::dynload::miopenDestroyTensorDescriptor(data_desc_)); +// PADDLE_ENFORCE_CUDA_SUCCESS( +// platform::dynload::miopenDestroyTensorDescriptor(bn_param_desc_)); #else // clean when exit. PADDLE_ENFORCE_CUDA_SUCCESS( diff --git a/paddle/fluid/operators/bce_loss_op.cu b/paddle/fluid/operators/bce_loss_op.cu index 99153101fc326..8bd2b7fe2d127 100644 --- a/paddle/fluid/operators/bce_loss_op.cu +++ b/paddle/fluid/operators/bce_loss_op.cu @@ -32,6 +32,11 @@ __global__ void GPUBCELossForward(const T* x_data, const T* label_data, T one = static_cast(1.); T neg_100 = static_cast(-100.); + PADDLE_ENFORCE( + (x >= static_cast(0)) && (x <= one), + "Input is expected to be within the interval [0, 1], but recieved %f.", + x); + T term1 = max(real_log(x), neg_100); T term2 = max(real_log(one - x), neg_100); @@ -64,29 +69,13 @@ class BCELossCUDAKernel : public framework::OpKernel { auto* labels = ctx.Input("Label"); auto* out = ctx.Output("Out"); - auto x_data = x->data(); - auto out_data = out->mutable_data(ctx.GetPlace()); + const auto* x_data = x->data(); + auto* out_data = out->mutable_data(ctx.GetPlace()); auto x_numel = x->numel(); - platform::GpuLaunchConfig config = - platform::GetGpuLaunchConfig1D(ctx.cuda_device_context(), x_numel); - - Tensor x_cpu; - framework::TensorCopy(*x, platform::CPUPlace(), &x_cpu); - T* x_cpu_data = x_cpu.data(); - - for (int64_t i = 0; i < x_numel; ++i) { - PADDLE_ENFORCE_GE( - x_cpu_data[i], static_cast(0), - platform::errors::InvalidArgument( - "Illegal input, input must be greater than or equal to 0")); - PADDLE_ENFORCE_LE( - x_cpu_data[i], static_cast(1), - platform::errors::InvalidArgument( - "Illegal input, input must be less than or equal to 1")); - } - auto& dev_ctx = ctx.cuda_device_context(); + platform::GpuLaunchConfig config = + platform::GetGpuLaunchConfig1D(dev_ctx, x_numel); GPUBCELossForward<<>>(x_data, labels->data(), @@ -102,9 +91,10 @@ class BCELossGradCUDAKernel : public framework::OpKernel { auto* labels = ctx.Input("Label"); auto* dout = ctx.Input(framework::GradVarName("Out")); auto* dx = ctx.Output(framework::GradVarName("X")); - auto dx_data = dx->mutable_data(ctx.GetPlace()); int x_numel = x->numel(); + auto* dx_data = dx->mutable_data(ctx.GetPlace()); + auto& dev_ctx = ctx.cuda_device_context(); platform::GpuLaunchConfig config = platform::GetGpuLaunchConfig1D(dev_ctx, x_numel); diff --git a/paddle/fluid/operators/cast_op_npu.cc b/paddle/fluid/operators/cast_op_npu.cc new file mode 100644 index 0000000000000..0de0f5e450579 --- /dev/null +++ b/paddle/fluid/operators/cast_op_npu.cc @@ -0,0 +1,100 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include + +#include "paddle/fluid/operators/cast_op.h" +#include "paddle/fluid/operators/npu_op_runner.h" + +namespace paddle { +namespace operators { + +static std::map + DTYPE_2_ACL_DTYPE = { + {framework::proto::VarType::BOOL, ACL_BOOL}, + {framework::proto::VarType::INT16, ACL_INT16}, + {framework::proto::VarType::INT32, ACL_INT32}, + {framework::proto::VarType::INT64, ACL_INT64}, + {framework::proto::VarType::FP16, ACL_FLOAT16}, + {framework::proto::VarType::FP32, ACL_FLOAT}, + {framework::proto::VarType::FP64, ACL_DOUBLE}, +}; + +using Tensor = framework::Tensor; + +template +class CastNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + int dtype = ctx.Attr("out_dtype"); + auto* out = ctx.Output("Out"); + auto place = ctx.GetPlace(); + + if (x->type() == dtype) { + // NOTE(zhiqiu): NPU cast op may result in wrong value, so + // add special case here. + VLOG(4) << "cast to same dtype:" << dtype; + out->mutable_data(place, x->type()); + framework::TensorCopy( + *x, ctx.GetPlace(), + ctx.template device_context(), out); + return; + } + + auto iter = DTYPE_2_ACL_DTYPE.find( + static_cast(dtype)); + int aclDtype = iter->second; + + if (dtype == framework::proto::VarType::FP32) { + out->mutable_data(place); + } else if (dtype == framework::proto::VarType::FP16) { + out->mutable_data(place); + } else if (dtype == framework::proto::VarType::INT16) { + out->mutable_data(place); + } else if (dtype == framework::proto::VarType::INT32) { + out->mutable_data(place); + } else if (dtype == framework::proto::VarType::INT64) { + out->mutable_data(place); + } else if (dtype == framework::proto::VarType::FP64) { + out->mutable_data(place); + } else if (dtype == framework::proto::VarType::BOOL) { + out->mutable_data(place); + } + + auto stream = + ctx.template device_context() + .stream(); + + auto runner = NpuOpRunner("Cast", {*x}, {*out}, + {{"dst_type", static_cast(aclDtype)}}); + runner.Run(stream); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_NPU_KERNEL( + cast, ops::CastNPUKernel, + ops::CastNPUKernel, + ops::CastNPUKernel, + ops::CastNPUKernel, + ops::CastNPUKernel, + ops::CastNPUKernel, + ops::CastNPUKernel, + ops::CastNPUKernel); diff --git a/paddle/fluid/operators/clip_op.cc b/paddle/fluid/operators/clip_op.cc index eb27df8a36757..7176a0466bb83 100644 --- a/paddle/fluid/operators/clip_op.cc +++ b/paddle/fluid/operators/clip_op.cc @@ -145,10 +145,14 @@ REGISTER_OPERATOR(clip_grad, ops::ClipOpGrad, ops::ClipGradInplaceInferer, ops::ClipDoubleGradOpMaker); REGISTER_OP_CPU_KERNEL( clip, ops::ClipKernel, - ops::ClipKernel); + ops::ClipKernel, + ops::ClipKernel, + ops::ClipKernel); REGISTER_OP_CPU_KERNEL( clip_grad, ops::ClipGradKernel, - ops::ClipGradKernel); + ops::ClipGradKernel, + ops::ClipGradKernel, + ops::ClipGradKernel); REGISTER_OP_VERSION(clip) .AddCheckpoint( diff --git a/paddle/fluid/operators/clip_op.cu b/paddle/fluid/operators/clip_op.cu index d31b81c13c5cf..fd61e4ea61d4f 100644 --- a/paddle/fluid/operators/clip_op.cu +++ b/paddle/fluid/operators/clip_op.cu @@ -17,8 +17,12 @@ limitations under the License. 
*/ namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL( clip, ops::ClipKernel, - ops::ClipKernel); + ops::ClipKernel, + ops::ClipKernel, + ops::ClipKernel); REGISTER_OP_CUDA_KERNEL( clip_grad, ops::ClipGradKernel, - ops::ClipGradKernel); + ops::ClipGradKernel, + ops::ClipGradKernel, + ops::ClipGradKernel); diff --git a/paddle/fluid/operators/collective/CMakeLists.txt b/paddle/fluid/operators/collective/CMakeLists.txt index 977a208d20e78..3f210219608fb 100644 --- a/paddle/fluid/operators/collective/CMakeLists.txt +++ b/paddle/fluid/operators/collective/CMakeLists.txt @@ -11,7 +11,7 @@ foreach(src ${OPS}) set_source_files_properties(${src} PROPERTIES COMPILE_FLAGS ${COLLECTIVE_COMPILE_FLAGS}) endforeach() -register_operators(EXCLUDES c_gen_bkcl_id_op gen_bkcl_id_op c_gen_nccl_id_op gen_nccl_id_op DEPS ${COLLECTIVE_DEPS}) +register_operators(EXCLUDES c_gen_bkcl_id_op gen_bkcl_id_op c_gen_nccl_id_op gen_nccl_id_op c_gen_hccl_id_op gen_hccl_id_op DEPS ${COLLECTIVE_DEPS}) if(WITH_NCCL OR WITH_RCCL) set(COLLECTIVE_DEPS ${COLLECTIVE_DEPS} nccl_common collective_helper) @@ -19,12 +19,6 @@ if(WITH_NCCL OR WITH_RCCL) op_library(gen_nccl_id_op DEPS ${COLLECTIVE_DEPS}) endif() -if(WITH_ASCEND) - op_library(gen_nccl_id_op) - op_library(c_gen_nccl_id_op) -endif() - - if(WITH_GLOO) set(COLLECTIVE_DEPS ${COLLECTIVE_DEPS} gloo_wrapper) endif() @@ -35,5 +29,38 @@ if(WITH_XPU_BKCL) op_library(gen_bkcl_id_op DEPS ${COLLECTIVE_DEPS}) endif() +if(WITH_ASCEND_CL) + cc_library(gen_hccl_id_op_helper SRCS gen_hccl_id_op_helper.cc DEPS dynload_warpctc dynamic_loader scope) + set(COLLECTIVE_DEPS ${COLLECTIVE_DEPS} collective_helper gen_hccl_id_op_helper) + op_library(c_gen_hccl_id_op DEPS ${COLLECTIVE_DEPS}) + op_library(gen_hccl_id_op DEPS ${COLLECTIVE_DEPS}) +endif() + set(OPERATOR_DEPS ${OPERATOR_DEPS} ${COLLECTIVE_DEPS} PARENT_SCOPE) set(GLOB_COLLECTIVE_DEPS ${COLLECTIVE_DEPS} CACHE INTERNAL "collective dependency") + +if(WITH_ASCEND_CL) + set(COMMON_TEST_DEPS_FOR_HCOM c_comm_init_hccl_op c_gen_hccl_id_op gen_hccl_id_op_helper + gen_hccl_id_op op_registry ascend_hccl flags + dynamic_loader dynload_warpctc scope device_context enforce executor) + cc_test(c_broadcast_op_npu_test SRCS c_broadcast_op_npu_test.cc + DEPS c_broadcast_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM}) + cc_test(c_allreduce_sum_op_npu_test SRCS c_allreduce_sum_op_npu_test.cc + DEPS c_allreduce_sum_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM}) + cc_test(c_reducescatter_op_npu_test SRCS c_reducescatter_op_npu_test.cc + DEPS c_reducescatter_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM}) + cc_test(c_allgather_op_npu_test SRCS c_allgather_op_npu_test.cc + DEPS c_allgather_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM}) + cc_test(c_reduce_sum_op_npu_test SRCS c_reduce_sum_op_npu_test.cc + DEPS c_reduce_sum_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM}) + cc_test(c_allreduce_max_op_npu_test SRCS c_allreduce_max_op_npu_test.cc + DEPS c_allreduce_max_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM}) + cc_test(send_v2_op_npu_test SRCS send_v2_op_npu_test.cc + DEPS send_v2_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM}) + cc_test(recv_v2_op_npu_test SRCS recv_v2_op_npu_test.cc + DEPS recv_v2_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM}) + cc_test(c_sync_comm_stream_op_npu_test SRCS c_sync_comm_stream_op_npu_test.cc + DEPS op_registry c_broadcast_op c_comm_init_hccl_op c_sync_comm_stream_op c_gen_hccl_id_op gen_hccl_id_op_helper ${COLLECTIVE_DEPS} ascend_hccl dynamic_loader dynload_warpctc scope device_context 
enforce executor) + cc_test(c_sync_calc_stream_op_npu_test SRCS c_sync_calc_stream_op_npu_test.cc + DEPS op_registry elementwise_add_op c_sync_calc_stream_op c_gen_hccl_id_op gen_hccl_id_op_helper ${COLLECTIVE_DEPS} ascend_hccl dynamic_loader dynload_warpctc scope device_context enforce executor) +endif() diff --git a/paddle/fluid/operators/collective/allreduce_op.cc b/paddle/fluid/operators/collective/allreduce_op.cc index 86f1c28a9dd4f..63b135a74cf4b 100644 --- a/paddle/fluid/operators/collective/allreduce_op.cc +++ b/paddle/fluid/operators/collective/allreduce_op.cc @@ -15,7 +15,7 @@ limitations under the License. */ #include // NOLINT #include -#include "paddle/fluid/operators/distributed_ops/allreduce_op.h" +#include "paddle/fluid/operators/collective/allreduce_op.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/collective/allreduce_op.cu.cc b/paddle/fluid/operators/collective/allreduce_op.cu.cc index 9b70f78399026..fe2e491055270 100644 --- a/paddle/fluid/operators/collective/allreduce_op.cu.cc +++ b/paddle/fluid/operators/collective/allreduce_op.cu.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/distributed_ops/allreduce_op.h" +#include "paddle/fluid/operators/collective/allreduce_op.h" namespace ops = paddle::operators; namespace plat = paddle::platform; diff --git a/paddle/fluid/operators/collective/c_allgather_op.cc b/paddle/fluid/operators/collective/c_allgather_op.cc index 4111a19c5ebc8..c4e779698ccca 100644 --- a/paddle/fluid/operators/collective/c_allgather_op.cc +++ b/paddle/fluid/operators/collective/c_allgather_op.cc @@ -42,6 +42,10 @@ class CAllGatherOpMaker : public framework::OpProtoAndCheckerMaker { AddOutput("Out", "(Tensor) the allgather result"); AddAttr("ring_id", "(int default 0) communication ring id.") .SetDefault(0); +#if defined(PADDLE_WITH_ASCEND_CL) + AddAttr("tag", "(string default tag) tag for all gather.") + .SetDefault("tag"); +#endif AddAttr( "use_calc_stream", "(bool default false) eject CUDA operations to calculation stream.") diff --git a/paddle/fluid/operators/collective/c_allgather_op_npu.cc b/paddle/fluid/operators/collective/c_allgather_op_npu.cc new file mode 100644 index 0000000000000..e7f05549d9efe --- /dev/null +++ b/paddle/fluid/operators/collective/c_allgather_op_npu.cc @@ -0,0 +1,83 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
 */
+
+#include "paddle/fluid/operators/collective/c_allgather_op.h"
+
+#include <memory>
+
+#if defined(PADDLE_WITH_ASCEND_CL)
+#include "paddle/fluid/platform/collective_helper.h"
+#include "paddle/fluid/platform/hccl_helper.h"
+#endif
+
+namespace paddle {
+namespace operators {
+
+template <typename T>
+class CAllGatherOpASCENDKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &ctx) const override {
+#if defined(PADDLE_WITH_ASCEND_CL)
+    auto in = ctx.Input<framework::Tensor>("X");
+    auto out = ctx.Output<framework::Tensor>("Out");
+    HcclDataType dtype = platform::ToHCCLDataType(in->type());
+
+    int ring_id = ctx.Attr<int>("ring_id");
+    std::string group =
+        std::string(HCOM_GROUP_PREFIX) + std::to_string(ring_id);
+    auto place = ctx.GetPlace();
+    auto comm = platform::HCCLCommContext::Instance().Get(ring_id, place);
+    int nranks = comm->nranks();
+
+    framework::DDim out_dims = in->dims();
+    out_dims[0] *= nranks;
+    out->mutable_data<T>(out_dims, place);
+
+    uint64_t send_numel = in->numel();
+    void *send_buff = reinterpret_cast<void *>(const_cast<T *>(in->data<T>()));
+    void *recv_buff = reinterpret_cast<void *>(out->data<T>());
+
+    aclrtStream stream = nullptr;
+    if (ctx.Attr<bool>("use_calc_stream")) {
+      auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
+      stream = static_cast<platform::NPUDeviceContext *>(dev_ctx)->stream();
+    } else {
+      stream = comm->stream();
+    }
+
+    VLOG(3) << "begin hccl allgather, parameter is: "
+            << ", group is " << group << ", ring_id is " << ring_id
+            << ", nranks is " << nranks;
+
+    PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclAllGather(
+        send_buff, recv_buff, send_numel, dtype, comm->comm(),
+        reinterpret_cast<void *>(stream)));
+
+#else
+    PADDLE_THROW(platform::errors::PreconditionNotMet(
+        "PaddlePaddle should compile with NPU."));
+#endif
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+
+REGISTER_OP_NPU_KERNEL(c_allgather, ops::CAllGatherOpASCENDKernel<int8_t>,
+                       ops::CAllGatherOpASCENDKernel<int>,
+                       ops::CAllGatherOpASCENDKernel<float>,
+                       ops::CAllGatherOpASCENDKernel<plat::float16>);
diff --git a/paddle/fluid/operators/collective/c_allgather_op_npu_test.cc b/paddle/fluid/operators/collective/c_allgather_op_npu_test.cc
new file mode 100644
index 0000000000000..4c7dfc4aad7d0
--- /dev/null
+++ b/paddle/fluid/operators/collective/c_allgather_op_npu_test.cc
@@ -0,0 +1,192 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
*/ + +#ifndef _WIN32 +#include +#endif + +#include +#include +#include // NOLINT +#include + +#include "gtest/gtest.h" + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/operators/dropout_op.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/string/printf.h" + +#include "paddle/fluid/operators/collective/c_allgather_op.h" +#include "paddle/fluid/operators/collective/c_allreduce_op.h" +#include "paddle/fluid/operators/collective/c_broadcast_op.h" +#include "paddle/fluid/operators/collective/c_reducescatter_op.h" +#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h" + +#if defined(PADDLE_WITH_ASCEND_CL) +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/hccl_helper.h" +#endif + +namespace f = paddle::framework; +namespace p = paddle::platform; +namespace m = paddle::operators::math; + +USE_OP(c_allgather); +USE_NO_KERNEL_OP(c_gen_hccl_id); +USE_NO_KERNEL_OP(c_comm_init_hccl); +USE_OP_DEVICE_KERNEL(c_allgather, NPU); + +DECLARE_string(selected_npus); + +template +void PrintDebugInfo(const std::string preStr, const std::vector& data) { + std::string debugstring = ""; + for (auto ele : data) { + debugstring += std::to_string(ele) + std::string(","); + } + VLOG(2) << preStr << ":" << std::endl << debugstring; +} + +void PrepareUniqueId(f::Scope* scope, const p::DeviceContext& ctx, + HcclRootInfo* hccl_id) { + int rank_id = atoi(getenv("RANK_ID")); + int device_id = atoi(getenv("DEVICE_ID")); + + VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id + << "; rank_id = " << rank_id + << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID")); + + std::vector rank_ids{0, 1}; + f::AttributeMap gen_hccl_id; + + std::vector endpointList = {"127.0.0.1:6175", "127.0.0.1:6177"}; + gen_hccl_id["rank"] = rank_id; + gen_hccl_id["endpoint"] = endpointList[rank_id]; + std::vector other_endpoints = { + endpointList[rank_id == 0 ? 
1 : 0]}; + gen_hccl_id["other_endpoints"] = other_endpoints; + + auto out = scope->Var("Out"); + auto id = out->GetMutable(); + + VLOG(3) << "break"; + + auto comm_init_op = f::OpRegistry::CreateOp("c_gen_hccl_id", {}, + {{"Out", {"Out"}}}, gen_hccl_id); + VLOG(3) << "break"; + auto place = ctx.GetPlace(); + comm_init_op->Run(*scope, place); + ctx.Wait(); + + memcpy(hccl_id, id, 1024); +} + +void Prepare(f::Scope* scope, const p::DeviceContext& ctx, + HcclRootInfo* hccl_id) { + auto x = scope->Var("X"); + auto id = x->GetMutable(); + + memcpy(id, hccl_id, 1024); + + int rank_id = atoi(getenv("RANK_ID")); + int device_id = atoi(getenv("DEVICE_ID")); + + VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id + << "; rank_id = " << rank_id + << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID")); + + // std::vector rank_ids{0, 1}; + f::AttributeMap comm_init_attrs; + comm_init_attrs["ring_id"] = 0; + comm_init_attrs["rank_ids"] = 2; + comm_init_attrs["rank"] = rank_id; + comm_init_attrs["device_id"] = device_id; + // comm_init_attrs["rank_ids"] = rank_ids; + auto comm_init_op = f::OpRegistry::CreateOp( + "c_comm_init_hccl", {{"X", {"X"}}}, {}, comm_init_attrs); + auto place = ctx.GetPlace(); + comm_init_op->Run(*scope, place); + ctx.Wait(); +} + +void TestHCCLAllGatherOp(f::Scope* scope, const p::DeviceContext& ctx) { + // init + auto x = scope->Var("Data"); + auto tensor_x = x->GetMutable(); + + std::vector init; + int rank_id = atoi(getenv("RANK_ID")); + + int num1 = 1; + int num2 = 4; + + for (int64_t i = 0; i < num1 * num2; ++i) { + init.push_back(1.0 + rank_id); + } + PrintDebugInfo("input data", init); + + TensorFromVector(init, ctx, tensor_x); + tensor_x->Resize({num1, num2}); + ctx.Wait(); + + auto place = ctx.GetPlace(); + auto out = scope->Var("OutData"); + auto tensor_out = out->GetMutable(); + tensor_out->Resize({num1, num2}); + tensor_out->mutable_data(place); // allocate + ctx.Wait(); + + // run + f::AttributeMap attrs; + attrs["tag"] = std::string("tagx"); + attrs["ring_id"] = 0; + attrs["nranks"] = 2; + + auto op = f::OpRegistry::CreateOp("c_allgather", {{"X", {"Data"}}}, + {{"Out", {"OutData"}}}, attrs); + + for (int i = 0; i < 10; i++) { + op->Run(*scope, place); + } + ctx.Wait(); + + std::vector out_vec; + TensorToVector(*tensor_out, ctx, &out_vec); + ctx.Wait(); + + PrintDebugInfo("output data", out_vec); + + EXPECT_EQ(out_vec.size(), init.size() * 2); + for (uint32_t i = 0; i < out_vec.size() / 2; i++) { + EXPECT_EQ(out_vec[i], 1.0); + } + for (uint32_t i = out_vec.size() / 2; i < out_vec.size(); i++) { + EXPECT_EQ(out_vec[i], 2.0); + } +} + +TEST(c_allgather, NPU) { + f::Scope scope; + HcclRootInfo hccl_id; + + // only support one device, if more than one device, use first default + p::NPUDeviceContext ctx(p::NPUPlace(atoi(FLAGS_selected_npus.c_str()))); + + PrepareUniqueId(&scope, ctx, &hccl_id); + Prepare(&scope, ctx, &hccl_id); + TestHCCLAllGatherOp(&scope, ctx); +} diff --git a/paddle/fluid/operators/collective/c_allreduce_max_op_npu.cc b/paddle/fluid/operators/collective/c_allreduce_max_op_npu.cc new file mode 100644 index 0000000000000..4dece4a3721ff --- /dev/null +++ b/paddle/fluid/operators/collective/c_allreduce_max_op_npu.cc @@ -0,0 +1,31 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/collective/c_allreduce_op.h" + +namespace paddle { +namespace platform { +struct ASCENDPlace; +struct float16; +} // namespace platform +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_NPU_KERNEL( + c_allreduce_max, ops::CAllReduceOpASCENDKernel, + ops::CAllReduceOpASCENDKernel, + ops::CAllReduceOpASCENDKernel, + ops::CAllReduceOpASCENDKernel) diff --git a/paddle/fluid/operators/collective/c_allreduce_max_op_npu_test.cc b/paddle/fluid/operators/collective/c_allreduce_max_op_npu_test.cc new file mode 100644 index 0000000000000..b7fd2739d5118 --- /dev/null +++ b/paddle/fluid/operators/collective/c_allreduce_max_op_npu_test.cc @@ -0,0 +1,188 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifndef _WIN32 +#include +#endif + +#include +#include +#include // NOLINT +#include + +#include "gtest/gtest.h" + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/operators/dropout_op.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/string/printf.h" + +#include "paddle/fluid/operators/collective/c_allgather_op.h" +#include "paddle/fluid/operators/collective/c_allreduce_op.h" +#include "paddle/fluid/operators/collective/c_broadcast_op.h" +#include "paddle/fluid/operators/collective/c_reducescatter_op.h" +#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h" + +#if defined(PADDLE_WITH_ASCEND_CL) +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/hccl_helper.h" +#endif + +namespace f = paddle::framework; +namespace p = paddle::platform; +namespace m = paddle::operators::math; + +USE_OP(c_allreduce_max); +USE_NO_KERNEL_OP(c_gen_hccl_id); +USE_NO_KERNEL_OP(c_comm_init_hccl); +USE_OP_DEVICE_KERNEL(c_allreduce_max, NPU); + +DECLARE_string(selected_npus); + +template +void PrintDebugInfo(const std::string preStr, const std::vector& data) { + std::string debugstring = ""; + for (auto ele : data) { + debugstring += std::to_string(ele) + std::string(","); + } + VLOG(2) << preStr << ":" << std::endl << debugstring; +} + +void PrepareUniqueId(f::Scope* scope, const p::DeviceContext& ctx, + HcclRootInfo* hccl_id) { + int rank_id = atoi(getenv("RANK_ID")); + int device_id = atoi(getenv("DEVICE_ID")); + + VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id + << "; rank_id = " << rank_id + << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID")); + + std::vector 
rank_ids{0, 1}; + f::AttributeMap gen_hccl_id; + + std::vector endpointList = {"127.0.0.1:6175", "127.0.0.1:6177"}; + gen_hccl_id["rank"] = rank_id; + gen_hccl_id["endpoint"] = endpointList[rank_id]; + std::vector other_endpoints = { + endpointList[rank_id == 0 ? 1 : 0]}; + gen_hccl_id["other_endpoints"] = other_endpoints; + + auto out = scope->Var("Out"); + auto id = out->GetMutable(); + + VLOG(3) << "break"; + + auto comm_init_op = f::OpRegistry::CreateOp("c_gen_hccl_id", {}, + {{"Out", {"Out"}}}, gen_hccl_id); + VLOG(3) << "break"; + auto place = ctx.GetPlace(); + comm_init_op->Run(*scope, place); + ctx.Wait(); + + memcpy(hccl_id, id, 1024); +} + +void Prepare(f::Scope* scope, const p::DeviceContext& ctx, + HcclRootInfo* hccl_id) { + auto x = scope->Var("X"); + auto id = x->GetMutable(); + + memcpy(id, hccl_id, 1024); + + int rank_id = atoi(getenv("RANK_ID")); + int device_id = atoi(getenv("DEVICE_ID")); + + VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id + << "; rank_id = " << rank_id + << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID")); + + // std::vector rank_ids{0, 1}; + f::AttributeMap comm_init_attrs; + comm_init_attrs["ring_id"] = 0; + comm_init_attrs["rank_ids"] = 2; + comm_init_attrs["rank"] = rank_id; + comm_init_attrs["device_id"] = device_id; + // comm_init_attrs["rank_ids"] = rank_ids; + auto comm_init_op = f::OpRegistry::CreateOp( + "c_comm_init_hccl", {{"X", {"X"}}}, {}, comm_init_attrs); + auto place = ctx.GetPlace(); + comm_init_op->Run(*scope, place); + ctx.Wait(); +} + +void TestHCCLAllReduceOp(f::Scope* scope, const p::DeviceContext& ctx) { + // init + auto x = scope->Var("Data"); + auto tensor_x = x->GetMutable(); + + std::vector init; + int rank_id = atoi(getenv("RANK_ID")); + + int num1 = 100; + int num2 = 100; + + for (int64_t i = 0; i < num1 * num2; ++i) { + init.push_back(1.0 + rank_id * 3); + } + PrintDebugInfo("input data", init); + + TensorFromVector(init, ctx, tensor_x); + tensor_x->Resize({num1, num2}); + ctx.Wait(); + + auto place = ctx.GetPlace(); + auto out = scope->Var("OutData"); + auto tensor_out = out->GetMutable(); + tensor_out->Resize({num1, num2}); + tensor_out->mutable_data(place); // allocate + ctx.Wait(); + + // run + f::AttributeMap attrs; + attrs["tag"] = std::string("tagx"); + attrs["ring_id"] = 0; + + auto op = f::OpRegistry::CreateOp("c_allreduce_max", {{"X", {"Data"}}}, + {{"Out", {"OutData"}}}, attrs); + + for (int i = 0; i < 10; i++) { + op->Run(*scope, place); + } + ctx.Wait(); + + std::vector out_vec; + TensorToVector(*tensor_out, ctx, &out_vec); + ctx.Wait(); + + PrintDebugInfo("output data", out_vec); + + EXPECT_EQ(out_vec.size(), init.size()); + for (uint32_t i = 0; i < out_vec.size(); i++) { + EXPECT_EQ(out_vec[i], 4.0); + } +} + +TEST(c_allreduce_max, NPU) { + f::Scope scope; + HcclRootInfo hccl_id; + + // only support one device, if more than one device, use first default + p::NPUDeviceContext ctx(p::NPUPlace(atoi(FLAGS_selected_npus.c_str()))); + + PrepareUniqueId(&scope, ctx, &hccl_id); + Prepare(&scope, ctx, &hccl_id); + TestHCCLAllReduceOp(&scope, ctx); +} diff --git a/paddle/fluid/operators/distributed_ops/allreduce_op.cu.cc b/paddle/fluid/operators/collective/c_allreduce_max_op_xpu.cc similarity index 55% rename from paddle/fluid/operators/distributed_ops/allreduce_op.cu.cc rename to paddle/fluid/operators/collective/c_allreduce_max_op_xpu.cc index 9b70f78399026..b0aa51f7cfdfd 100644 --- a/paddle/fluid/operators/distributed_ops/allreduce_op.cu.cc +++ 
b/paddle/fluid/operators/collective/c_allreduce_max_op_xpu.cc
@@ -1,4 +1,4 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -12,14 +12,17 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/distributed_ops/allreduce_op.h"
+#include "paddle/fluid/operators/collective/c_allreduce_op.h"
+
+namespace paddle {
+namespace platform {
+struct XPUPlace;
+struct float16;
+}  // namespace platform
+}  // namespace paddle
 
 namespace ops = paddle::operators;
 namespace plat = paddle::platform;
 
-REGISTER_OP_CUDA_KERNEL(
-    allreduce, ops::AllReduceOpKernel<plat::CUDADeviceContext, float>,
-    ops::AllReduceOpKernel<plat::CUDADeviceContext, double>,
-    ops::AllReduceOpKernel<plat::CUDADeviceContext, int>,
-    ops::AllReduceOpKernel<plat::CUDADeviceContext, int64_t>,
-    ops::AllReduceOpKernel<plat::CUDADeviceContext, plat::float16>);
+REGISTER_OP_XPU_KERNEL(c_allreduce_max,
+                       ops::CAllReduceOpXPUKernel<ops::kRedMax, float>)
diff --git a/paddle/fluid/operators/collective/c_allreduce_min_op_npu.cc b/paddle/fluid/operators/collective/c_allreduce_min_op_npu.cc
new file mode 100644
index 0000000000000..48e1d2eeb58c5
--- /dev/null
+++ b/paddle/fluid/operators/collective/c_allreduce_min_op_npu.cc
@@ -0,0 +1,31 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/collective/c_allreduce_op.h"
+
+namespace paddle {
+namespace platform {
+struct ASCENDPlace;
+struct float16;
+}  // namespace platform
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+
+REGISTER_OP_NPU_KERNEL(
+    c_allreduce_min, ops::CAllReduceOpASCENDKernel<ops::kRedMin, int>,
+    ops::CAllReduceOpASCENDKernel<ops::kRedMin, int8_t>,
+    ops::CAllReduceOpASCENDKernel<ops::kRedMin, float>,
+    ops::CAllReduceOpASCENDKernel<ops::kRedMin, plat::float16>)
diff --git a/paddle/fluid/operators/collective/c_allreduce_min_op_xpu.cc b/paddle/fluid/operators/collective/c_allreduce_min_op_xpu.cc
new file mode 100644
index 0000000000000..2f16a89c217da
--- /dev/null
+++ b/paddle/fluid/operators/collective/c_allreduce_min_op_xpu.cc
@@ -0,0 +1,28 @@
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
*/ + +#include "paddle/fluid/operators/collective/c_allreduce_op.h" + +namespace paddle { +namespace platform { +struct XPUPlace; +struct float16; +} // namespace platform +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_XPU_KERNEL(c_allreduce_min, + ops::CAllReduceOpXPUKernel) diff --git a/paddle/fluid/operators/collective/c_allreduce_op.h b/paddle/fluid/operators/collective/c_allreduce_op.h index 2f56f43d793fa..0eaa377869ef6 100644 --- a/paddle/fluid/operators/collective/c_allreduce_op.h +++ b/paddle/fluid/operators/collective/c_allreduce_op.h @@ -19,17 +19,31 @@ limitations under the License. */ #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/memory/memcpy.h" +#include "paddle/fluid/memory/memory.h" -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ + defined(PADDLE_WITH_ASCEND_CL) || defined(PADDLE_WITH_XPU_BKCL) #include "paddle/fluid/platform/collective_helper.h" +#endif + +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/platform/nccl_helper.h" #endif +#if defined(PADDLE_WITH_XPU_BKCL) +#include "paddle/fluid/platform/bkcl_helper.h" +#endif + #if defined(PADDLE_WITH_GLOO) #include #include "paddle/fluid/framework/fleet/gloo_wrapper.h" #endif +#if defined(PADDLE_WITH_ASCEND_CL) +#include "paddle/fluid/platform/hccl_helper.h" +#endif + namespace paddle { namespace operators { @@ -105,6 +119,135 @@ class CAllReduceOpCPUKernel : public framework::OpKernel { } }; +template +class CAllReduceOpASCENDKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { +#if defined(PADDLE_WITH_ASCEND_CL) + auto in = ctx.Input("X"); + auto out = ctx.Output("Out"); + auto place = ctx.GetPlace(); + HcclDataType dtype = platform::ToHCCLDataType(in->type()); + int64_t numel = in->numel(); + + void* sendbuff = reinterpret_cast(const_cast(in->data())); + void* recvbuff = reinterpret_cast(out->data()); + + int ring_id = ctx.Attr("ring_id"); + std::string group = + std::string(HCOM_GROUP_PREFIX) + std::to_string(ring_id); + auto comm = + paddle::platform::HCCLCommContext::Instance().Get(ring_id, place); + + aclrtStream stream = nullptr; + auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); + if (ctx.Attr("use_calc_stream")) { + stream = static_cast(dev_ctx)->stream(); + } else { + stream = comm->stream(); + } + + HcclReduceOp hccl_red_type = HCCL_REDUCE_SUM; + switch (red_type) { + case kRedSum: + hccl_red_type = HCCL_REDUCE_SUM; + break; + + case kRedMax: + hccl_red_type = HCCL_REDUCE_MAX; + break; + + case kRedMin: + hccl_red_type = HCCL_REDUCE_MIN; + break; + + case kRedProd: + hccl_red_type = HCCL_REDUCE_PROD; + break; + + default: + PADDLE_THROW(platform::errors::InvalidArgument( + "Invalid reduce type: %d", red_type)); + } + + VLOG(3) << "begin hccl allreduce, parameter is: " + << "input num: " << numel << "dtype: " << dtype + << "hccl_red_type: " << hccl_red_type << ", group is: " << group; + + PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclAllReduce( + sendbuff, recvbuff, numel, dtype, hccl_red_type, comm->comm(), + reinterpret_cast(stream))); + + out->Resize(in->dims()); +#else + PADDLE_THROW(platform::errors::PreconditionNotMet( + "PaddlePaddle should compile with NPU.")); +#endif + } +}; + +template +class CAllReduceOpXPUKernel : public 
framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { +#if defined(PADDLE_WITH_XPU_BKCL) + auto in = ctx.Input("X"); + auto out = ctx.Output("Out"); + + auto place = ctx.GetPlace(); + BKCLDataType dtype = platform::ToBKCLDataType(in->type()); + int64_t numel = in->numel(); + const void* sendbuff = in->data(); + out->Resize(in->dims()); + void* recvbuff = out->mutable_data(place); + + int rid = ctx.Attr("ring_id"); + auto comm = platform::BKCLCommContext::Instance().Get(rid, place); + + XPUStream stream = nullptr; + if (ctx.Attr("use_calc_stream")) { + auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); + stream = static_cast(dev_ctx) + ->x_context() + ->xpu_stream; + } else { + stream = comm->stream(); + } + + BKCLOp bkcl_red_type = BKCL_ADD; + switch (red_type) { + case kRedSum: + bkcl_red_type = BKCL_ADD; + break; + + case kRedMax: + bkcl_red_type = BKCL_MAX; + break; + + case kRedMin: + bkcl_red_type = BKCL_MIN; + break; + + case kRedProd: + bkcl_red_type = BKCL_PRODUCT; + break; + + default: + PADDLE_THROW(platform::errors::InvalidArgument( + "Invalid reduce type: %d", red_type)); + } + + PADDLE_ENFORCE_EQ(bkcl_all_reduce(comm->comm(), sendbuff, recvbuff, numel, + dtype, bkcl_red_type, stream), + BKCL_SUCCESS, platform::errors::PreconditionNotMet( + "BKCL all reduce failed")); +#else + PADDLE_THROW(platform::errors::PreconditionNotMet( + "PaddlePaddle should be compiled with XPU.")); +#endif + } +}; + template class CAllReduceOpCUDAKernel : public framework::OpKernel { public: @@ -170,10 +313,20 @@ class CAllReduceOpMaker : public framework::OpProtoAndCheckerMaker { AddOutput("Out", "(Tensor) the allreduced result."); AddAttr("ring_id", "(int default 0) communication ring id.") .SetDefault(0); +#if defined(PADDLE_WITH_ASCEND_CL) + AddAttr("tag", "(string default tag) tag for all reduce.") + .SetDefault("tag"); +#endif AddAttr( "use_calc_stream", "(bool default false) eject CUDA operations to calculation stream.") .SetDefault(false); + AddAttr( + "use_model_parallel", + "(bool default false) use this op with model parallel mode. In model " + "parallel mode, the backward is c_identity which returns itself for " + "c_allreduce_sum.") + .SetDefault(false); AddComment(string::Sprintf(R"DOC( CAllReduce %s Operator diff --git a/paddle/fluid/operators/collective/c_allreduce_prod_op_npu.cc b/paddle/fluid/operators/collective/c_allreduce_prod_op_npu.cc new file mode 100644 index 0000000000000..f3d14afe0a1bc --- /dev/null +++ b/paddle/fluid/operators/collective/c_allreduce_prod_op_npu.cc @@ -0,0 +1,31 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
 */
+
+#include "paddle/fluid/operators/collective/c_allreduce_op.h"
+
+namespace paddle {
+namespace platform {
+struct ASCENDPlace;
+struct float16;
+}  // namespace platform
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+
+REGISTER_OP_NPU_KERNEL(
+    c_allreduce_prod, ops::CAllReduceOpASCENDKernel<ops::kRedProd, int>,
+    ops::CAllReduceOpASCENDKernel<ops::kRedProd, int8_t>,
+    ops::CAllReduceOpASCENDKernel<ops::kRedProd, float>,
+    ops::CAllReduceOpASCENDKernel<ops::kRedProd, plat::float16>)
diff --git a/paddle/fluid/operators/collective/c_allreduce_prod_op_xpu.cc b/paddle/fluid/operators/collective/c_allreduce_prod_op_xpu.cc
new file mode 100644
index 0000000000000..92ba00428065b
--- /dev/null
+++ b/paddle/fluid/operators/collective/c_allreduce_prod_op_xpu.cc
@@ -0,0 +1,28 @@
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/collective/c_allreduce_op.h"
+
+namespace paddle {
+namespace platform {
+struct XPUPlace;
+struct float16;
+}  // namespace platform
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+
+REGISTER_OP_XPU_KERNEL(c_allreduce_prod,
+                       ops::CAllReduceOpXPUKernel<ops::kRedProd, float>)
diff --git a/paddle/fluid/operators/collective/c_allreduce_sum_op.cc b/paddle/fluid/operators/collective/c_allreduce_sum_op.cc
index 68061e6ae6bea..23ed98bb044be 100644
--- a/paddle/fluid/operators/collective/c_allreduce_sum_op.cc
+++ b/paddle/fluid/operators/collective/c_allreduce_sum_op.cc
@@ -37,7 +37,12 @@ class CAllReduceSumOpGradMaker : public framework::SingleGradOpMaker<T> {
 
  protected:
   void Apply(GradOpPtr<T> retv) const override {
-    retv->SetType("c_allreduce_sum");
+    bool use_mp = BOOST_GET_CONST(bool, this->GetAttr("use_model_parallel"));
+    if (use_mp) {
+      retv->SetType("c_identity");
+    } else {
+      retv->SetType("c_allreduce_sum");
+    }
     retv->SetInput("X", this->OutputGrad("Out"));
     retv->SetOutput("Out", this->InputGrad("X"));
     retv->SetAttrMap(this->Attrs());
diff --git a/paddle/fluid/operators/collective/c_allreduce_sum_op_npu.cc b/paddle/fluid/operators/collective/c_allreduce_sum_op_npu.cc
new file mode 100644
index 0000000000000..b66e2e1968908
--- /dev/null
+++ b/paddle/fluid/operators/collective/c_allreduce_sum_op_npu.cc
@@ -0,0 +1,31 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
*/ + +#include "paddle/fluid/operators/collective/c_allreduce_op.h" + +namespace paddle { +namespace platform { +struct ASCENDPlace; +struct float16; +} // namespace platform +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_NPU_KERNEL( + c_allreduce_sum, ops::CAllReduceOpASCENDKernel, + ops::CAllReduceOpASCENDKernel, + ops::CAllReduceOpASCENDKernel, + ops::CAllReduceOpASCENDKernel) diff --git a/paddle/fluid/operators/collective/c_allreduce_sum_op_npu_test.cc b/paddle/fluid/operators/collective/c_allreduce_sum_op_npu_test.cc new file mode 100644 index 0000000000000..f1bf9683e3559 --- /dev/null +++ b/paddle/fluid/operators/collective/c_allreduce_sum_op_npu_test.cc @@ -0,0 +1,189 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifndef _WIN32 +#include +#endif + +#include +#include +#include // NOLINT +#include + +#include "gtest/gtest.h" + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/operators/dropout_op.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/string/printf.h" + +#include "paddle/fluid/operators/collective/c_allreduce_op.h" +#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h" + +#if defined(PADDLE_WITH_ASCEND_CL) +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/hccl_helper.h" +#endif + +namespace f = paddle::framework; +namespace p = paddle::platform; +namespace m = paddle::operators::math; + +USE_OP(c_allreduce_sum); +USE_NO_KERNEL_OP(c_gen_hccl_id); +USE_NO_KERNEL_OP(c_comm_init_hccl); +USE_OP_DEVICE_KERNEL(c_allreduce_sum, NPU); + +DECLARE_string(selected_npus); + +template +void PrintDebugInfo(const std::string preStr, const std::vector& data) { + std::string debugstring = ""; + for (auto ele : data) { + debugstring += std::to_string(ele) + std::string(","); + } + VLOG(3) << preStr << ":" << std::endl << debugstring; +} + +void PrepareUniqueId(f::Scope* scope, const p::DeviceContext& ctx, + HcclRootInfo* hccl_id) { + int rank_id = atoi(getenv("RANK_ID")); + int device_id = atoi(getenv("DEVICE_ID")); + + VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id + << "; rank_id = " << rank_id + << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID")); + + std::vector rank_ids{0, 1}; + f::AttributeMap gen_hccl_id; + + std::vector endpointList = {"127.0.0.1:6175", "127.0.0.1:6177"}; + gen_hccl_id["rank"] = rank_id; + gen_hccl_id["endpoint"] = endpointList[rank_id]; + std::vector other_endpoints = { + endpointList[rank_id == 0 ? 
1 : 0]}; + gen_hccl_id["other_endpoints"] = other_endpoints; + + auto out = scope->Var("Out"); + auto id = out->GetMutable(); + + VLOG(3) << "break"; + + auto comm_init_op = f::OpRegistry::CreateOp("c_gen_hccl_id", {}, + {{"Out", {"Out"}}}, gen_hccl_id); + VLOG(3) << "break"; + auto place = ctx.GetPlace(); + comm_init_op->Run(*scope, place); + ctx.Wait(); + + memcpy(hccl_id, id, 1024); +} + +void Prepare(f::Scope* scope, const p::DeviceContext& ctx, + HcclRootInfo* hccl_id) { + auto x = scope->Var("X"); + auto id = x->GetMutable(); + + memcpy(id, hccl_id, 1024); + + int rank_id = atoi(getenv("RANK_ID")); + int device_id = atoi(getenv("DEVICE_ID")); + + VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id + << "; rank_id = " << rank_id + << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID")); + + // std::vector rank_ids{0, 1}; + f::AttributeMap comm_init_attrs; + comm_init_attrs["ring_id"] = 0; + comm_init_attrs["rank_ids"] = 2; + comm_init_attrs["rank"] = rank_id; + comm_init_attrs["device_id"] = device_id; + // comm_init_attrs["rank_ids"] = rank_ids; + auto comm_init_op = f::OpRegistry::CreateOp( + "c_comm_init_hccl", {{"X", {"X"}}}, {}, comm_init_attrs); + auto place = ctx.GetPlace(); + comm_init_op->Run(*scope, place); + ctx.Wait(); +} + +void TestHCCLAllReduceOp(f::Scope* scope, const p::DeviceContext& ctx, + int iter) { + // init + auto x = scope->Var("Data"); + auto tensor_x = x->GetMutable(); + + int rank_id = atoi(getenv("RANK_ID")); + int num1 = 3; + int num2 = 128; + + std::vector init; + for (int64_t i = 0; i < num1 * num2; ++i) { + init.push_back(1.0 + rank_id); + } + PrintDebugInfo("input data", init); + + auto place = ctx.GetPlace(); + + TensorFromVector(init, ctx, tensor_x); + tensor_x->Resize({num1, num2}); + ctx.Wait(); + + auto out = scope->Var("OutData"); + auto tensor_out = out->GetMutable(); + tensor_out->Resize({num1, num2}); + tensor_out->mutable_data(place); // allocate + ctx.Wait(); + + // run + f::AttributeMap attrs; + attrs["tag"] = std::string("tagx_" + std::to_string(iter)); + attrs["ring_id"] = 0; + + auto op = f::OpRegistry::CreateOp("c_allreduce_sum", {{"X", {"Data"}}}, + {{"Out", {"OutData"}}}, attrs); + + for (int i = 0; i < 10; i++) { + op->Run(*scope, place); + } + ctx.Wait(); + + std::vector out_vec; + TensorToVector(*tensor_out, ctx, &out_vec); + ctx.Wait(); + + PrintDebugInfo("output data", out_vec); + + EXPECT_EQ(out_vec.size(), init.size()); + for (uint32_t i = 0; i < out_vec.size(); i++) { + EXPECT_EQ(out_vec[i], 3.0); + } +} + +TEST(c_allreduce_sum, NPU) { + f::Scope scope; + HcclRootInfo hccl_id; + + p::NPUDeviceContext ctx(p::NPUPlace(atoi(FLAGS_selected_npus.c_str()))); + + // only support one device, if more than one device, use first default + PrepareUniqueId(&scope, ctx, &hccl_id); + Prepare(&scope, ctx, &hccl_id); + for (int i = 0; i < 1; i++) { + VLOG(2) << "iter num: " << i; + TestHCCLAllReduceOp(&scope, ctx, i); + } +} diff --git a/paddle/fluid/operators/collective/c_allreduce_sum_op_xpu.cc b/paddle/fluid/operators/collective/c_allreduce_sum_op_xpu.cc new file mode 100644 index 0000000000000..e4ec538cd2323 --- /dev/null +++ b/paddle/fluid/operators/collective/c_allreduce_sum_op_xpu.cc @@ -0,0 +1,28 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/collective/c_allreduce_op.h"
+
+namespace paddle {
+namespace platform {
+struct XPUPlace;
+struct float16;
+}  // namespace platform
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+
+REGISTER_OP_XPU_KERNEL(c_allreduce_sum,
+                       ops::CAllReduceOpXPUKernel<ops::kRedSum, float>)
diff --git a/paddle/fluid/operators/collective/c_broadcast_op.cc b/paddle/fluid/operators/collective/c_broadcast_op.cc
index 928fa8549ffb9..271d543eb2364 100644
--- a/paddle/fluid/operators/collective/c_broadcast_op.cc
+++ b/paddle/fluid/operators/collective/c_broadcast_op.cc
@@ -42,6 +42,10 @@ class CBroadcastOpMaker : public framework::OpProtoAndCheckerMaker {
         .SetDefault(0);
     AddAttr<int>("root", "(int default 0) root id for broadcasting.")
         .SetDefault(0);
+#if defined(PADDLE_WITH_ASCEND_CL)
+    AddAttr<std::string>("tag", "(string default tag) tag for broadcasting.")
+        .SetDefault("tag");
+#endif
     AddAttr<bool>(
         "use_calc_stream",
         "(bool default false) eject CUDA operations to calculation stream.")
diff --git a/paddle/fluid/operators/collective/c_broadcast_op_npu.cc b/paddle/fluid/operators/collective/c_broadcast_op_npu.cc
new file mode 100644
index 0000000000000..a60ba86572822
--- /dev/null
+++ b/paddle/fluid/operators/collective/c_broadcast_op_npu.cc
@@ -0,0 +1,91 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
*/ + +#include "paddle/fluid/operators/collective/c_broadcast_op.h" + +#if defined(PADDLE_WITH_ASCEND_CL) +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/hccl_helper.h" +#endif + +namespace paddle { +namespace operators { + +template +class CBroadcastOpASCENDKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { +#if defined(PADDLE_WITH_ASCEND_CL) + auto x = ctx.Input("X"); + void* ptr = reinterpret_cast(const_cast(x->data())); + int numel = x->numel(); + HcclDataType dtype = platform::ToHCCLDataType(x->type()); + + auto out = ctx.Output("Out"); + + int ring_id = ctx.Attr("ring_id"); + auto place = ctx.GetPlace(); + auto comm = + paddle::platform::HCCLCommContext::Instance().Get(ring_id, place); + + aclrtStream stream = nullptr; + auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); + if (ctx.Attr("use_calc_stream")) { + stream = static_cast(dev_ctx)->stream(); + } else { + stream = comm->stream(); + } + + int root = ctx.Attr("root"); + std::string group = + std::string(HCOM_GROUP_PREFIX) + std::to_string(ring_id); + + VLOG(3) << "begin hccl broadcast, parameter is: " + << "root " << root << ", group is " << group + << ", comm: " << comm->comm() << ", stream: " << stream; + + PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclBroadcast( + ptr, numel, dtype, (uint32_t)root, comm->comm(), stream)); + + VLOG(3) << "rank " << comm->rank() << " invoke Bcast. recieved " + << framework::product(out->dims()); + + dev_ctx->Wait(); + + if (out != x) { + framework::TensorCopy(*static_cast(x), place, + *platform::DeviceContextPool::Instance().Get(place), + static_cast(out)); + } + dev_ctx->Wait(); + + out->Resize(x->dims()); + out->set_lod(x->lod()); +#else + PADDLE_THROW(platform::errors::PreconditionNotMet( + "PaddlePaddle should compile with NPU.")); +#endif + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_NPU_KERNEL(c_broadcast, ops::CBroadcastOpASCENDKernel, + ops::CBroadcastOpASCENDKernel, + ops::CBroadcastOpASCENDKernel, + ops::CBroadcastOpASCENDKernel); diff --git a/paddle/fluid/operators/collective/c_broadcast_op_npu_test.cc b/paddle/fluid/operators/collective/c_broadcast_op_npu_test.cc new file mode 100644 index 0000000000000..9e39613f3fbe3 --- /dev/null +++ b/paddle/fluid/operators/collective/c_broadcast_op_npu_test.cc @@ -0,0 +1,181 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifndef _WIN32 +#include +#endif + +#include +#include +#include // NOLINT +#include + +#include "gtest/gtest.h" + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/operators/dropout_op.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/string/printf.h" + +#include "paddle/fluid/operators/collective/c_broadcast_op.h" +#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h" + +#if defined(PADDLE_WITH_ASCEND_CL) +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/hccl_helper.h" +#endif + +namespace f = paddle::framework; +namespace p = paddle::platform; +namespace m = paddle::operators::math; + +USE_OP(c_broadcast); +USE_NO_KERNEL_OP(c_gen_hccl_id); +USE_NO_KERNEL_OP(c_comm_init_hccl); +USE_OP_DEVICE_KERNEL(c_broadcast, NPU); + +DECLARE_string(selected_npus); + +template +void PrintDebugInfo(const std::string preStr, const std::vector& data) { + std::string debugstring = ""; + for (auto ele : data) { + debugstring += std::to_string(ele) + std::string(","); + } + VLOG(2) << preStr << ":" << std::endl << debugstring; +} + +void PrepareUniqueId(f::Scope* scope, const p::DeviceContext& ctx, + HcclRootInfo* hccl_id) { + int rank_id = atoi(getenv("RANK_ID")); + int device_id = atoi(getenv("DEVICE_ID")); + + VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id + << "; rank_id = " << rank_id + << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID")); + + std::vector rank_ids{0, 1}; + f::AttributeMap gen_hccl_id; + + std::vector endpointList = {"127.0.0.1:6175", "127.0.0.1:6177"}; + gen_hccl_id["rank"] = rank_id; + gen_hccl_id["endpoint"] = endpointList[rank_id]; + std::vector other_endpoints = { + endpointList[rank_id == 0 ? 
1 : 0]}; + gen_hccl_id["other_endpoints"] = other_endpoints; + + auto out = scope->Var("Out"); + auto id = out->GetMutable(); + + VLOG(3) << "break"; + + auto comm_init_op = f::OpRegistry::CreateOp("c_gen_hccl_id", {}, + {{"Out", {"Out"}}}, gen_hccl_id); + VLOG(3) << "break"; + auto place = ctx.GetPlace(); + comm_init_op->Run(*scope, place); + ctx.Wait(); + + memcpy(hccl_id, id, 1024); +} + +void Prepare(f::Scope* scope, const p::DeviceContext& ctx, + HcclRootInfo* hccl_id) { + auto x = scope->Var("X"); + auto id = x->GetMutable(); + + memcpy(id, hccl_id, 1024); + + int rank_id = atoi(getenv("RANK_ID")); + int device_id = atoi(getenv("DEVICE_ID")); + + VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id + << "; rank_id = " << rank_id + << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID")); + + // std::vector rank_ids{0, 1}; + f::AttributeMap comm_init_attrs; + comm_init_attrs["ring_id"] = 0; + comm_init_attrs["rank_ids"] = 2; + comm_init_attrs["rank"] = rank_id; + comm_init_attrs["device_id"] = device_id; + // comm_init_attrs["rank_ids"] = rank_ids; + auto comm_init_op = f::OpRegistry::CreateOp( + "c_comm_init_hccl", {{"X", {"X"}}}, {}, comm_init_attrs); + auto place = ctx.GetPlace(); + comm_init_op->Run(*scope, place); + ctx.Wait(); +} + +void TestHCCLBroadcastOp(f::Scope* scope, const p::DeviceContext& ctx) { + // init + auto x = scope->Var("Data"); + auto tensor_x = x->GetMutable(); + int num = 2; + std::vector init; + int rank_id = atoi(getenv("RANK_ID")); + + for (int64_t i = 0; i < num * num; ++i) { + init.push_back(1.0 + rank_id); + } + PrintDebugInfo("input data", init); + + TensorFromVector(init, ctx, tensor_x); + tensor_x->Resize({num, num}); + ctx.Wait(); + + auto place = ctx.GetPlace(); + auto out = scope->Var("OutData"); + auto tensor_out = out->GetMutable(); + tensor_out->Resize({num, num}); + tensor_out->mutable_data(place); // allocate + ctx.Wait(); + + // run + f::AttributeMap attrs; + attrs["tag"] = std::string("tagx"); + attrs["root"] = 0; + attrs["ring_id"] = 0; + + auto op = f::OpRegistry::CreateOp("c_broadcast", {{"X", {"Data"}}}, + {{"Out", {"OutData"}}}, attrs); + + for (int i = 0; i < 10; i++) { + op->Run(*scope, place); + } + ctx.Wait(); + + std::vector out_vec; + TensorToVector(*tensor_out, ctx, &out_vec); + ctx.Wait(); + + PrintDebugInfo("output data", out_vec); + EXPECT_EQ(out_vec.size(), init.size()); + for (uint32_t i = 0; i < out_vec.size(); i++) { + EXPECT_EQ(out_vec[i], 1.0); + } +} + +TEST(c_broadcast, NPU) { + f::Scope scope; + HcclRootInfo hccl_id; + // only support one device, if more than one device, use first default + p::NPUDeviceContext ctx(p::NPUPlace(atoi(FLAGS_selected_npus.c_str()))); + + PrepareUniqueId(&scope, ctx, &hccl_id); + Prepare(&scope, ctx, &hccl_id); + TestHCCLBroadcastOp(&scope, ctx); +} diff --git a/paddle/fluid/operators/collective/c_comm_init_hccl_op.cc b/paddle/fluid/operators/collective/c_comm_init_hccl_op.cc new file mode 100644 index 0000000000000..7817f19bacb18 --- /dev/null +++ b/paddle/fluid/operators/collective/c_comm_init_hccl_op.cc @@ -0,0 +1,96 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include + +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace framework { +class Scope; +} // namespace framework +} // namespace paddle +#if defined(PADDLE_WITH_ASCEND_CL) +#include "paddle/fluid/platform/collective_helper.h" +#endif + +namespace paddle { +namespace operators { + +class CCommInitOpAscend : public framework::OperatorBase { + public: + CCommInitOpAscend(const std::string& type, + const framework::VariableNameMap& inputs, + const framework::VariableNameMap& outputs, + const framework::AttributeMap& attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + + void RunImpl(const framework::Scope& scope, + const platform::Place& place) const override { + PADDLE_ENFORCE_EQ(is_npu_place(place), true, + platform::errors::PreconditionNotMet( + "CCommInitOpAscend can run on npu place only.")); + + auto var = scope.FindVar(Input("X")); + PADDLE_ENFORCE_NOT_NULL( + var, platform::errors::InvalidArgument("Input con not be empty.")); +#if defined(PADDLE_WITH_ASCEND_CL) + HcclRootInfo* hccl_id = var->GetMutable(); + + int rank_ids = Attr("rank_ids"); + int rank_id = Attr("rank"); + int rid = Attr("ring_id"); + int device_id = BOOST_GET_CONST(platform::NPUPlace, place).device; + if (Attr("device_id") >= 0) { + device_id = Attr("device_id"); + } + platform::HCCLCommContext::Instance().CreateHCCLComm( + hccl_id, rank_ids, rank_id, device_id, rid); +#else + PADDLE_THROW(platform::errors::PreconditionNotMet( + "PaddlePaddle should compile with NPU.")); +#endif + } +}; + +class CCommInitOpAscendMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "Raw variable contains a NCCL UniqueId instaces."); + AddComment(R"DOC( +CCommInit operator + +Initialize collective communicatoin context within this trainer +)DOC"); + AddAttr("rank_ids", + "(int) The number of ranks of distributed trainers"); + AddAttr("rank", + "(int) The rank of the trainer in distributed training."); + AddAttr("device_id", + "(int) The deivce_id on which to initialize the communicator." + "Now, you only have to set this attr manually for pipeline " + "training. Otherwise, make it as default.") + .SetDefault(-1); + AddAttr("ring_id", "(int default 0) user specified ring id") + .SetDefault(0); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OPERATOR(c_comm_init_hccl, ops::CCommInitOpAscend, + ops::CCommInitOpAscendMaker); diff --git a/paddle/fluid/operators/collective/c_concat_op.cc b/paddle/fluid/operators/collective/c_concat_op.cc new file mode 100644 index 0000000000000..551fde2116258 --- /dev/null +++ b/paddle/fluid/operators/collective/c_concat_op.cc @@ -0,0 +1,112 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/collective/c_concat_op.h" + +namespace paddle { +namespace operators { + +class CConcatOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "c_concat"); + OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "c_concat"); + int nranks = ctx->Attrs().Get("nranks"); + int rank = ctx->Attrs().Get("rank"); + int ring_id = ctx->Attrs().Get("ring_id"); + PADDLE_ENFORCE_GE(nranks, 2, platform::errors::InvalidArgument( + "The number of ranks (%d) for c_concat " + "must be greater than 1.", + nranks)); + PADDLE_ENFORCE_GE( + ring_id, 0, + platform::errors::InvalidArgument( + "The ring_id (%d) for c_concat must be non-negative.", ring_id)); + PADDLE_ENFORCE_GE( + rank, 0, platform::errors::InvalidArgument( + "The rank (%d) for c_concat must be non-negative.", rank)); + PADDLE_ENFORCE_LT(rank, nranks, + platform::errors::InvalidArgument( + "The value of rank (%d) for c_concat must " + "be less than that of nranks.", + rank, nranks)); + + framework::DDim dim = ctx->GetInputDim("X"); + dim[dim.size() - 1] = dim[dim.size() - 1] * nranks; + if (dim[dim.size() - 1] < 0) dim[dim.size() - 1] = -1; + ctx->SetOutputDim("Out", dim); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace()); + } +}; + +template +class CConcatOpGradMaker : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + + protected: + void Apply(GradOpPtr retv) const override { + retv->SetType("c_split"); + retv->SetInput("X", this->OutputGrad("Out")); + retv->SetOutput("Out", this->InputGrad("X")); + retv->SetAttrMap(this->Attrs()); + } +}; + +class CConcatOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() { + AddInput("X", "(Tensor) tensor to be concated."); + AddOutput("Out", "(Tensor) the result of concat."); + AddAttr("rank", "(int default 0) rank id.").SetDefault(0); + AddAttr("nranks", "(int default 1) number of ranks.").SetDefault(1); + AddAttr("ring_id", "(int default 0) ring id.").SetDefault(0); + AddAttr( + "use_calc_stream", + "(bool default true) eject CUDA operations to calculation stream.") + .SetDefault(true); + AddAttr("use_model_parallel", + "(bool default true) use this op with model parallel.") + .SetDefault(true); + AddComment(R"DOC( +CConcat Operator +AllGather the tensors on different trainers and concat them along the last dimension. 
+)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OPERATOR(c_concat, ops::CConcatOp, + ops::CConcatOpGradMaker, + ops::CConcatOpGradMaker, + ops::CConcatOpMaker); + +REGISTER_OP_CPU_KERNEL(c_concat, ops::CConcatOpCPUKernel, + ops::CConcatOpCPUKernel, + ops::CConcatOpCPUKernel, + ops::CConcatOpCPUKernel, + ops::CConcatOpCPUKernel); diff --git a/paddle/fluid/operators/collective/c_concat_op.cu.cc b/paddle/fluid/operators/collective/c_concat_op.cu.cc new file mode 100644 index 0000000000000..bfdc49c440aae --- /dev/null +++ b/paddle/fluid/operators/collective/c_concat_op.cu.cc @@ -0,0 +1,110 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include + +#include "paddle/fluid/operators/collective/c_concat_op.h" +#include "paddle/fluid/operators/math/concat_and_split.h" + +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/nccl_helper.h" +#endif + +namespace paddle { +namespace operators { + +template +class CConcatOpCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto x = ctx.Input("X"); + auto out = ctx.Output("Out"); + ncclDataType_t dtype = platform::ToNCCLDataType(x->type()); + + int nranks = ctx.Attr("nranks"); + int rank = ctx.Attr("rank"); + int rid = ctx.Attr("ring_id"); + auto place = ctx.GetPlace(); + PADDLE_ENFORCE_GE(rank, 0, + platform::errors::PreconditionNotMet( + "The value of rank (%d) for c_concat must be " + "greater than or equal to 0.", + rank)); + PADDLE_ENFORCE_GE(nranks, 2, + platform::errors::PreconditionNotMet( + "The value of nranks (%d) for c_concat must be " + "greater than or equal to 2.", + nranks)); + PADDLE_ENFORCE_LT(rank, nranks, + platform::errors::PreconditionNotMet( + "The value of rank (%d) for c_concat must be " + "less than that of nranks (%d).", + rank, nranks)); + +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) + auto comm = platform::NCCLCommContext::Instance().Get(rid, place); + PADDLE_ENFORCE_EQ( + nranks, comm->nranks(), + platform::errors::InvalidArgument("nranks: %s should equal to %s", + nranks, comm->nranks())); + + framework::Tensor temp_out; + framework::DDim temp_out_dims = x->dims(); + temp_out_dims[0] *= nranks; + temp_out.mutable_data(temp_out_dims, place); + int64_t send_numel = x->numel(); + const T* send_buff = x->data(); + T* recv_buff = temp_out.data(); + gpuStream_t stream = nullptr; + auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); + stream = static_cast(dev_ctx)->stream(); + + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllGather( + send_buff, recv_buff, send_numel, static_cast(dtype), + comm->comm(), stream)); + + std::vector inputs; + int axis = x->dims().size() - 1; + auto out_dims = x->dims(); + out_dims[out_dims.size() - 1] *= nranks; + int rows_per_tensor = 
x->dims()[0]; + int offset = 0; + for (int i = 0; i < nranks; i++) { + framework::Tensor temp = temp_out.Slice(offset, offset + rows_per_tensor); + inputs.emplace_back(temp); + offset += rows_per_tensor; + } + + math::ConcatFunctor functor; + out->mutable_data(out_dims, place); + auto& dev_ctx2 = ctx.template device_context(); + functor(dev_ctx2, inputs, axis, out); +#else + PADDLE_THROW(platform::errors::PreconditionNotMet( + "PaddlePaddle should compile with GPU.")); +#endif + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_CUDA_KERNEL(c_concat, ops::CConcatOpCUDAKernel, + ops::CConcatOpCUDAKernel, + ops::CConcatOpCUDAKernel, + ops::CConcatOpCUDAKernel, + ops::CConcatOpCUDAKernel); diff --git a/paddle/fluid/operators/distributed_ops/split_byref_op.h b/paddle/fluid/operators/collective/c_concat_op.h similarity index 51% rename from paddle/fluid/operators/distributed_ops/split_byref_op.h rename to paddle/fluid/operators/collective/c_concat_op.h index fedd7218dd6cc..55a5799e37b6f 100644 --- a/paddle/fluid/operators/distributed_ops/split_byref_op.h +++ b/paddle/fluid/operators/collective/c_concat_op.h @@ -1,10 +1,10 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at -http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, @@ -14,28 +14,23 @@ limitations under the License. */ #pragma once +#include +#include #include + +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" namespace paddle { namespace operators { -template -class SplitByrefOpKernel : public framework::OpKernel { +template +class CConcatOpCPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* in = ctx.Input("X"); - auto outs = ctx.MultiOutput("Out"); - auto place = ctx.GetPlace(); - - size_t row_offset = 0; - for (size_t i = 0; i < outs.size(); ++i) { - // NOTE: no need to call mutable_data here to allocate memory. - auto* out = outs[i]; - VLOG(3) << "spliting by ref: " << row_offset << " " << out->dims()[0]; - *out = in->Slice(row_offset, row_offset + out->dims()[0]); - row_offset += out->dims()[0]; - } + PADDLE_THROW(platform::errors::Unavailable( + "Do not support c_concat for cpu kernel now.")); } }; diff --git a/paddle/fluid/operators/collective/c_gen_hccl_id_op.cc b/paddle/fluid/operators/collective/c_gen_hccl_id_op.cc new file mode 100644 index 0000000000000..593eaf923a978 --- /dev/null +++ b/paddle/fluid/operators/collective/c_gen_hccl_id_op.cc @@ -0,0 +1,111 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ +#include + +#include "glog/logging.h" +#include "paddle/fluid/framework/op_proto_maker.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/var_type_traits.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/place.h" + +#ifdef PADDLE_WITH_ASCEND_CL +#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h" +#endif + +namespace paddle { +namespace operators { + +#ifdef PADDLE_WITH_ASCEND_CL + +class CGenHCCLIdOp : public framework::OperatorBase { + public: + CGenHCCLIdOp(const std::string& type, + const framework::VariableNameMap& inputs, + const framework::VariableNameMap& outputs, + const framework::AttributeMap& attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + + void RunImpl(const framework::Scope& scope, + const platform::Place& dev_place) const override { + int rank = Attr("rank"); + framework::Scope& local_scope = scope.NewScope(); + + std::function func = [&](size_t i) -> std::string { + return Output("Out"); + }; + + if (rank == 0) { + std::vector endpoint_list = + Attr>("other_endpoints"); + SendBroadCastHCCLID(endpoint_list, 1, func, local_scope); + } else { + std::string endpoint = Attr("endpoint"); + RecvBroadCastHCCLID(endpoint, 1, func, local_scope); + } + scope.DeleteScope(&local_scope); + } +}; + +#else + +class CGenHCCLIdOp : public framework::OperatorBase { + public: + CGenHCCLIdOp(const std::string& type, + const framework::VariableNameMap& inputs, + const framework::VariableNameMap& outputs, + const framework::AttributeMap& attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + + void RunImpl(const framework::Scope& scope, + const platform::Place& dev_place) const override {} +}; + +#endif + +class CGenHCCLIdOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + VLOG(3) << "ele"; + AddOutput("Out", "Raw variable contains a HCCL UniqueId instaces."); + AddComment(R"DOC( +CGenHCCLId operator + +For trainer 0: generate a new UniqueId and send it to all the other trainers. +For trainer 1~n: start a gRPC server to get the UniqueId, once got, stop the server. +)DOC"); + AddAttr("endpoint", + "(string), e.g. 127.0.0.1:6175 " + "current listen endpoint"); + AddAttr>( + "other_endpoints", + "['trainer1_ip:port', 'trainer2_ip:port', ...] " + "list of other trainer endpoints") + .SetDefault({}); + AddAttr("rank", + "(int default 0) " + "The rank of the trainer in distributed training.") + .SetDefault(0); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OPERATOR(c_gen_hccl_id, ops::CGenHCCLIdOp, ops::CGenHCCLIdOpMaker); diff --git a/paddle/fluid/operators/collective/c_identity_op.cc b/paddle/fluid/operators/collective/c_identity_op.cc new file mode 100644 index 0000000000000..646c27b90e17e --- /dev/null +++ b/paddle/fluid/operators/collective/c_identity_op.cc @@ -0,0 +1,92 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/collective/c_identity_op.h" + +namespace paddle { +namespace operators { + +class CIdentityOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "c_identity"); + OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "c_identity"); + int ring_id = ctx->Attrs().Get("ring_id"); + PADDLE_ENFORCE_GE( + ring_id, 0, + platform::errors::InvalidArgument( + "The ring_id (%d) for c_identity must be non-negative.", ring_id)); + framework::DDim dim = ctx->GetInputDim("X"); + ctx->SetOutputDim("Out", dim); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace()); + } +}; + +class CIdentityOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() { + AddInput("X", "(Tensor) identity tensor."); + AddOutput("Out", "(Tensor) identity tensor."); + AddAttr("ring_id", "(int default 0) nccl communication ring id.") + .SetDefault(0); + AddAttr( + "use_calc_stream", + "(bool default true) eject CUDA operations to calculation stream.") + .SetDefault(true); + AddAttr("use_model_parallel", + "(bool default true) use this op with model parallel.") + .SetDefault(true); + AddComment(R"DOC( +Identity Operator which returns a copy of itself. +)DOC"); + } +}; + +template +class CIdentityOpGradMaker : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + + protected: + void Apply(GradOpPtr retv) const override { + retv->SetType("c_allreduce_sum"); + retv->SetInput("X", this->OutputGrad("Out")); + retv->SetOutput("Out", this->InputGrad("X")); + retv->SetAttrMap(this->Attrs()); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OPERATOR(c_identity, ops::CIdentityOp, + ops::CIdentityOpGradMaker, + ops::CIdentityOpGradMaker, + ops::CIdentityOpMaker); + +REGISTER_OP_CPU_KERNEL(c_identity, ops::CIdentityOpCPUKernel, + ops::CIdentityOpCPUKernel, + ops::CIdentityOpCPUKernel, + ops::CIdentityOpCPUKernel, + ops::CIdentityOpCPUKernel); diff --git a/paddle/fluid/operators/collective/c_identity_op.cu.cc b/paddle/fluid/operators/collective/c_identity_op.cu.cc new file mode 100644 index 0000000000000..8ccf40e317ade --- /dev/null +++ b/paddle/fluid/operators/collective/c_identity_op.cu.cc @@ -0,0 +1,48 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/collective/c_identity_op.h" + +namespace paddle { +namespace operators { + +template +class CIdentityOpCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto x = ctx.Input("X"); + auto out = ctx.Output("Out"); + + int rid = ctx.Attr("ring_id"); + PADDLE_ENFORCE_GE( + rid, 0, + platform::errors::InvalidArgument( + "The ring_id (%d) for c_identity op must be non-negative.", rid)); + out->mutable_data(ctx.GetPlace()); + + TensorCopy(*x, out->place(), out); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_CUDA_KERNEL(c_identity, ops::CIdentityOpCUDAKernel, + ops::CIdentityOpCUDAKernel, + ops::CIdentityOpCUDAKernel, + ops::CIdentityOpCUDAKernel, + ops::CIdentityOpCUDAKernel); diff --git a/paddle/fluid/operators/collective/c_identity_op.h b/paddle/fluid/operators/collective/c_identity_op.h new file mode 100644 index 0000000000000..ca817fb6bac0e --- /dev/null +++ b/paddle/fluid/operators/collective/c_identity_op.h @@ -0,0 +1,38 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include + +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +template +class CIdentityOpCPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + PADDLE_THROW(platform::errors::Unavailable( + "Do not support c_identity for cpu kernel now.")); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/collective/c_reduce_max_op_npu.cc b/paddle/fluid/operators/collective/c_reduce_max_op_npu.cc new file mode 100644 index 0000000000000..f35b4c2f70722 --- /dev/null +++ b/paddle/fluid/operators/collective/c_reduce_max_op_npu.cc @@ -0,0 +1,31 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/collective/c_reduce_op.h" + +namespace paddle { +namespace platform { +struct ASCENDPlace; +struct float16; +} // namespace platform +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_NPU_KERNEL(c_reduce_max, + ops::CReduceOpASCENDKernel, + ops::CReduceOpASCENDKernel, + ops::CReduceOpASCENDKernel, + ops::CReduceOpASCENDKernel) diff --git a/paddle/fluid/operators/split_selected_rows_op.cu b/paddle/fluid/operators/collective/c_reduce_max_op_xpu.cc similarity index 59% rename from paddle/fluid/operators/split_selected_rows_op.cu rename to paddle/fluid/operators/collective/c_reduce_max_op_xpu.cc index 7250917036f61..6d3af7bb5f258 100644 --- a/paddle/fluid/operators/split_selected_rows_op.cu +++ b/paddle/fluid/operators/collective/c_reduce_max_op_xpu.cc @@ -1,4 +1,4 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -12,8 +12,17 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/split_selected_rows_op.h" +#include "paddle/fluid/operators/collective/c_reduce_op.h" + +namespace paddle { +namespace platform { +struct XPUPlace; +struct float16; +} // namespace platform +} // namespace paddle + namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - split_selected_rows, - ops::SplitSelectedRowsOpKernel); +namespace plat = paddle::platform; + +REGISTER_OP_XPU_KERNEL(c_reduce_max, + ops::CReduceOpXPUKernel) diff --git a/paddle/fluid/operators/collective/c_reduce_min_op_npu.cc b/paddle/fluid/operators/collective/c_reduce_min_op_npu.cc new file mode 100644 index 0000000000000..6ebb7e4c40e68 --- /dev/null +++ b/paddle/fluid/operators/collective/c_reduce_min_op_npu.cc @@ -0,0 +1,31 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/collective/c_reduce_op.h" + +namespace paddle { +namespace platform { +struct ASCENDPlace; +struct float16; +} // namespace platform +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_NPU_KERNEL(c_reduce_min, + ops::CReduceOpASCENDKernel, + ops::CReduceOpASCENDKernel, + ops::CReduceOpASCENDKernel, + ops::CReduceOpASCENDKernel) diff --git a/paddle/fluid/operators/distributed_ops/split_byref_op.cu.cc b/paddle/fluid/operators/collective/c_reduce_min_op_xpu.cc similarity index 54% rename from paddle/fluid/operators/distributed_ops/split_byref_op.cu.cc rename to paddle/fluid/operators/collective/c_reduce_min_op_xpu.cc index 056659c3ea61f..791e58d8493ce 100644 --- a/paddle/fluid/operators/distributed_ops/split_byref_op.cu.cc +++ b/paddle/fluid/operators/collective/c_reduce_min_op_xpu.cc @@ -1,10 +1,10 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at -http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, @@ -12,8 +12,17 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/distributed_ops/split_byref_op.h" +#include "paddle/fluid/operators/collective/c_reduce_op.h" + +namespace paddle { +namespace platform { +struct XPUPlace; +struct float16; +} // namespace platform +} // namespace paddle + namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - split_byref, - ops::SplitByrefOpKernel); +namespace plat = paddle::platform; + +REGISTER_OP_XPU_KERNEL(c_reduce_min, + ops::CReduceOpXPUKernel) diff --git a/paddle/fluid/operators/collective/c_reduce_op.h b/paddle/fluid/operators/collective/c_reduce_op.h index 1bce01e13a2ad..fa9fd079d8e48 100644 --- a/paddle/fluid/operators/collective/c_reduce_op.h +++ b/paddle/fluid/operators/collective/c_reduce_op.h @@ -24,15 +24,28 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ + defined(PADDLE_WITH_XPU_BKCL) || defined(PADDLE_WITH_ASCEND_CL) #include "paddle/fluid/platform/collective_helper.h" +#endif + +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/platform/nccl_helper.h" #endif + +#if defined(PADDLE_WITH_XPU_BKCL) +#include "paddle/fluid/platform/bkcl_helper.h" +#endif + #if defined(PADDLE_WITH_GLOO) #include #include "paddle/fluid/framework/fleet/gloo_wrapper.h" #endif +#if defined(PADDLE_WITH_ASCEND_CL) +#include "paddle/fluid/platform/hccl_helper.h" +#endif + namespace paddle { namespace operators { @@ -110,6 +123,148 @@ class CReduceOpCPUKernel : public framework::OpKernel { } }; +template +class CReduceOpASCENDKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { +#if defined(PADDLE_WITH_ASCEND_CL) + auto in = ctx.Input("X"); + auto out = ctx.Output("Out"); + auto place = ctx.GetPlace(); + HcclDataType dtype = platform::ToHCCLDataType(in->type()); + int64_t numel = in->numel(); + + void* sendbuff = reinterpret_cast(const_cast(in->data())); + void* recvbuff = reinterpret_cast(out->data()); + + int ring_id = ctx.Attr("ring_id"); + int root_id = ctx.Attr("root_id"); + std::string group = + std::string(HCOM_GROUP_PREFIX) + std::to_string(ring_id); + auto comm = + paddle::platform::HCCLCommContext::Instance().Get(ring_id, place); + + aclrtStream stream = nullptr; + auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); + if (ctx.Attr("use_calc_stream")) { + stream = static_cast(dev_ctx)->stream(); + } else { + stream = comm->stream(); + } + + int rank_id = comm->rank(); + + HcclReduceOp hccl_red_type = HCCL_REDUCE_SUM; + switch (red_type) { + case kRedSum: + hccl_red_type = HCCL_REDUCE_SUM; + break; + + case kRedMax: + hccl_red_type = HCCL_REDUCE_MAX; + break; + + case kRedMin: + hccl_red_type = HCCL_REDUCE_MIN; + break; + + case kRedProd: + hccl_red_type = HCCL_REDUCE_PROD; + break; + + default: + PADDLE_THROW(platform::errors::InvalidArgument( + "Invalid reduce type: %d", red_type)); + } + + VLOG(3) << "begin hccl reduce, parameter is: " + << "input num: " << numel << "root_id: " << root_id + << "dtype: " << dtype << "hccl_red_type: " << hccl_red_type + << ", group is: " << group; + + PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclAllReduce( + sendbuff, recvbuff, numel, dtype, hccl_red_type, comm->comm(), + reinterpret_cast(stream))); + + if (rank_id != root_id) { + auto npu_place = BOOST_GET_CONST(platform::NPUPlace, place); + memory::Copy(npu_place, reinterpret_cast(out->data()), + npu_place, + reinterpret_cast(const_cast(in->data())), + numel * sizeof(T), stream); + } + + out->Resize(in->dims()); +#else + PADDLE_THROW(platform::errors::PreconditionNotMet( + "PaddlePaddle should compile with NPU.")); +#endif + } +}; + +template +class CReduceOpXPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { +#if defined(PADDLE_WITH_XPU_BKCL) + auto in = ctx.Input("X"); + auto out = ctx.Output("Out"); + + auto place = ctx.GetPlace(); + BKCLDataType dtype = platform::ToBKCLDataType(in->type()); + int64_t numel = in->numel(); + const void* sendbuff = in->data(); + out->Resize(in->dims()); + void* recvbuff = out->mutable_data(place); + + int rid = ctx.Attr("ring_id"); + 
int root = ctx.Attr("root_id"); + auto comm = platform::BKCLCommContext::Instance().Get(rid, place); + + XPUStream stream = nullptr; + if (ctx.Attr("use_calc_stream")) { + auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); + stream = static_cast(dev_ctx) + ->x_context() + ->xpu_stream; + } else { + stream = comm->stream(); + } + + BKCLOp bkcl_red_type = BKCL_ADD; + switch (red_type) { + case kRedSum: + bkcl_red_type = BKCL_ADD; + break; + + case kRedMax: + bkcl_red_type = BKCL_MAX; + break; + + case kRedMin: + bkcl_red_type = BKCL_MIN; + break; + + case kRedProd: + bkcl_red_type = BKCL_PRODUCT; + break; + + default: + PADDLE_THROW(platform::errors::InvalidArgument( + "Invalid reduce type: %d", red_type)); + } + + PADDLE_ENFORCE_EQ(bkcl_reduce(comm->comm(), sendbuff, recvbuff, numel, + dtype, bkcl_red_type, root, stream), + BKCL_SUCCESS, platform::errors::PreconditionNotMet( + "BKCL all reduce failed")); +#else + PADDLE_THROW(platform::errors::PreconditionNotMet( + "PaddlePaddle should be compiled with XPU.")); +#endif + } +}; + template class CReduceOpCUDAKernel : public framework::OpKernel { public: @@ -179,6 +334,10 @@ class CReduceOpMaker : public framework::OpProtoAndCheckerMaker { AddOutput("Out", "(Tensor) the reduced result."); AddAttr("ring_id", "(int default 0) communication ring id.") .SetDefault(0); +#if defined(PADDLE_WITH_ASCEND_CL) + AddAttr("tag", "(string default tag) tag for reduce.") + .SetDefault("tag"); +#endif AddAttr("root_id", "(int default 0) root id.").SetDefault(0); AddAttr( "use_calc_stream", diff --git a/paddle/fluid/operators/collective/c_reduce_prod_op_npu.cc b/paddle/fluid/operators/collective/c_reduce_prod_op_npu.cc new file mode 100644 index 0000000000000..f0b7021e7997d --- /dev/null +++ b/paddle/fluid/operators/collective/c_reduce_prod_op_npu.cc @@ -0,0 +1,31 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/collective/c_reduce_op.h" + +namespace paddle { +namespace platform { +struct ASCENDPlace; +struct float16; +} // namespace platform +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_NPU_KERNEL(c_reduce_prod, + ops::CReduceOpASCENDKernel, + ops::CReduceOpASCENDKernel, + ops::CReduceOpASCENDKernel, + ops::CReduceOpASCENDKernel) diff --git a/paddle/fluid/operators/collective/c_reduce_prod_op_xpu.cc b/paddle/fluid/operators/collective/c_reduce_prod_op_xpu.cc new file mode 100644 index 0000000000000..e7e770e8ffdca --- /dev/null +++ b/paddle/fluid/operators/collective/c_reduce_prod_op_xpu.cc @@ -0,0 +1,28 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/collective/c_reduce_op.h" + +namespace paddle { +namespace platform { +struct XPUPlace; +struct float16; +} // namespace platform +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_XPU_KERNEL(c_reduce_prod, + ops::CReduceOpXPUKernel) diff --git a/paddle/fluid/operators/collective/c_reduce_sum_op_npu.cc b/paddle/fluid/operators/collective/c_reduce_sum_op_npu.cc new file mode 100644 index 0000000000000..dd4dbbd5f3645 --- /dev/null +++ b/paddle/fluid/operators/collective/c_reduce_sum_op_npu.cc @@ -0,0 +1,31 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/collective/c_reduce_op.h" + +namespace paddle { +namespace platform { +struct ASCENDPlace; +struct float16; +} // namespace platform +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_NPU_KERNEL(c_reduce_sum, + ops::CReduceOpASCENDKernel, + ops::CReduceOpASCENDKernel, + ops::CReduceOpASCENDKernel, + ops::CReduceOpASCENDKernel) diff --git a/paddle/fluid/operators/collective/c_reduce_sum_op_npu_test.cc b/paddle/fluid/operators/collective/c_reduce_sum_op_npu_test.cc new file mode 100644 index 0000000000000..3683c7722ba3b --- /dev/null +++ b/paddle/fluid/operators/collective/c_reduce_sum_op_npu_test.cc @@ -0,0 +1,192 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifndef _WIN32 +#include +#endif + +#include +#include +#include // NOLINT +#include + +#include "gtest/gtest.h" + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/operators/dropout_op.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/string/printf.h" + +#include "paddle/fluid/operators/collective/c_reduce_op.h" +#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h" + +#if defined(PADDLE_WITH_ASCEND_CL) +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/hccl_helper.h" +#endif + +namespace f = paddle::framework; +namespace p = paddle::platform; +namespace m = paddle::operators::math; + +USE_OP(c_reduce_sum); +USE_NO_KERNEL_OP(c_gen_hccl_id); +USE_NO_KERNEL_OP(c_comm_init_hccl); +USE_OP_DEVICE_KERNEL(c_reduce_sum, NPU); + +DECLARE_string(selected_npus); + +template +void PrintDebugInfo(const std::string preStr, const std::vector& data) { + std::string debugstring = ""; + for (auto ele : data) { + debugstring += std::to_string(ele) + std::string(","); + } + VLOG(3) << preStr << ":" << std::endl << debugstring; +} + +void PrepareUniqueId(f::Scope* scope, const p::DeviceContext& ctx, + HcclRootInfo* hccl_id) { + int rank_id = atoi(getenv("RANK_ID")); + int device_id = atoi(getenv("DEVICE_ID")); + + VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id + << "; rank_id = " << rank_id + << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID")); + + std::vector rank_ids{0, 1}; + f::AttributeMap gen_hccl_id; + + std::vector endpointList = {"127.0.0.1:6175", "127.0.0.1:6177"}; + gen_hccl_id["rank"] = rank_id; + gen_hccl_id["endpoint"] = endpointList[rank_id]; + std::vector other_endpoints = { + endpointList[rank_id == 0 ? 
1 : 0]}; + gen_hccl_id["other_endpoints"] = other_endpoints; + + auto out = scope->Var("Out"); + auto id = out->GetMutable(); + + VLOG(3) << "break"; + + auto comm_init_op = f::OpRegistry::CreateOp("c_gen_hccl_id", {}, + {{"Out", {"Out"}}}, gen_hccl_id); + VLOG(3) << "break"; + auto place = ctx.GetPlace(); + comm_init_op->Run(*scope, place); + ctx.Wait(); + + memcpy(hccl_id, id, 1024); +} + +void Prepare(f::Scope* scope, const p::DeviceContext& ctx, + HcclRootInfo* hccl_id) { + auto x = scope->Var("X"); + auto id = x->GetMutable(); + + memcpy(id, hccl_id, 1024); + + int rank_id = atoi(getenv("RANK_ID")); + int device_id = atoi(getenv("DEVICE_ID")); + + VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id + << "; rank_id = " << rank_id + << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID")); + + // std::vector rank_ids{0, 1}; + f::AttributeMap comm_init_attrs; + comm_init_attrs["ring_id"] = 0; + comm_init_attrs["rank_ids"] = 2; + comm_init_attrs["rank"] = rank_id; + comm_init_attrs["device_id"] = device_id; + // comm_init_attrs["rank_ids"] = rank_ids; + auto comm_init_op = f::OpRegistry::CreateOp( + "c_comm_init_hccl", {{"X", {"X"}}}, {}, comm_init_attrs); + auto place = ctx.GetPlace(); + comm_init_op->Run(*scope, place); + ctx.Wait(); +} + +void TestHCCLReduceOp(f::Scope* scope, const p::DeviceContext& ctx, int iter) { + // init + auto x = scope->Var("Data"); + auto tensor_x = x->GetMutable(); + + int rank_id = atoi(getenv("RANK_ID")); + int num1 = 3; + int num2 = 128; + + std::vector init; + for (int64_t i = 0; i < num1 * num2; ++i) { + init.push_back(1.0 + rank_id); + } + PrintDebugInfo("input data", init); + + auto place = ctx.GetPlace(); + + TensorFromVector(init, ctx, tensor_x); + tensor_x->Resize({num1, num2}); + ctx.Wait(); + + auto out = scope->Var("OutData"); + auto tensor_out = out->GetMutable(); + tensor_out->Resize({num1, num2}); + tensor_out->mutable_data(place); // allocate + ctx.Wait(); + + // run + f::AttributeMap attrs; + attrs["tag"] = std::string("tagx_" + std::to_string(iter)); + attrs["ring_id"] = 0; + int root_id = 0; + attrs["root_id"] = root_id; + + auto op = f::OpRegistry::CreateOp("c_reduce_sum", {{"X", {"Data"}}}, + {{"Out", {"OutData"}}}, attrs); + + op->Run(*scope, place); + ctx.Wait(); + + std::vector out_vec; + TensorToVector(*tensor_out, ctx, &out_vec); + ctx.Wait(); + + PrintDebugInfo("output data", out_vec); + + EXPECT_EQ(out_vec.size(), init.size()); + for (uint32_t i = 0; i < out_vec.size(); i++) { + if (rank_id == root_id) { + EXPECT_EQ(out_vec[i], 3.0); + } else { + EXPECT_EQ(out_vec[i], init[i]); + } + } +} + +TEST(c_reduce_sum, NPU) { + f::Scope scope; + HcclRootInfo hccl_id; + + // only support one device, if more than one device, use first default + p::NPUDeviceContext ctx(p::NPUPlace(atoi(FLAGS_selected_npus.c_str()))); + + PrepareUniqueId(&scope, ctx, &hccl_id); + Prepare(&scope, ctx, &hccl_id); + for (int i = 0; i < 2; i++) { + VLOG(2) << "iter num: " << i; + TestHCCLReduceOp(&scope, ctx, i); + } +} diff --git a/paddle/fluid/operators/collective/c_reduce_sum_op_xpu.cc b/paddle/fluid/operators/collective/c_reduce_sum_op_xpu.cc new file mode 100644 index 0000000000000..a0ec4d2a99cd7 --- /dev/null +++ b/paddle/fluid/operators/collective/c_reduce_sum_op_xpu.cc @@ -0,0 +1,28 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/collective/c_reduce_op.h" + +namespace paddle { +namespace platform { +struct XPUPlace; +struct float16; +} // namespace platform +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_XPU_KERNEL(c_reduce_sum, + ops::CReduceOpXPUKernel) diff --git a/paddle/fluid/operators/collective/c_reducescatter_op.cc b/paddle/fluid/operators/collective/c_reducescatter_op.cc index ada1fd2b1270c..7836f11dc9b1f 100644 --- a/paddle/fluid/operators/collective/c_reducescatter_op.cc +++ b/paddle/fluid/operators/collective/c_reducescatter_op.cc @@ -49,6 +49,10 @@ class CReduceScatterOpMaker : public framework::OpProtoAndCheckerMaker { AddAttr("nranks", "Total trainer count of the distributed training job") .SetDefault(1); +#if defined(PADDLE_WITH_ASCEND_CL) + AddAttr("tag", "(string default tag) tag for reduce scatter.") + .SetDefault("tag"); +#endif AddAttr( "use_calc_stream", "(bool default false) eject CUDA operations to calculation stream.") diff --git a/paddle/fluid/operators/collective/c_reducescatter_op.h b/paddle/fluid/operators/collective/c_reducescatter_op.h index 366d8a3747cfb..490b152bc2d30 100644 --- a/paddle/fluid/operators/collective/c_reducescatter_op.h +++ b/paddle/fluid/operators/collective/c_reducescatter_op.h @@ -22,6 +22,7 @@ limitations under the License. */ #include "paddle/fluid/framework/ddim.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/collective/c_allreduce_op.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/collective/c_reducescatter_op_npu.cc b/paddle/fluid/operators/collective/c_reducescatter_op_npu.cc new file mode 100644 index 0000000000000..44096a82c34d6 --- /dev/null +++ b/paddle/fluid/operators/collective/c_reducescatter_op_npu.cc @@ -0,0 +1,87 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/collective/c_reducescatter_op.h" + +#if defined(PADDLE_WITH_ASCEND_CL) +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/hccl_helper.h" +#endif + +namespace paddle { +namespace operators { + +template +class CReduceScatterOpAscendKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { +#if defined(PADDLE_WITH_ASCEND_CL) + auto in = ctx.Input("X"); + auto out = ctx.Output("Out"); + + int ring_id = ctx.Attr("ring_id"); + std::string group = + std::string(HCOM_GROUP_PREFIX) + std::to_string(ring_id); + auto place = ctx.GetPlace(); + auto comm = platform::HCCLCommContext::Instance().Get(ring_id, place); + int nranks = comm->nranks(); + + auto out_dims = in->dims(); + PADDLE_ENFORCE_EQ(out_dims[0] % nranks, 0, + platform::errors::InvalidArgument( + "The input tensor X's " + "dim[0] (%d) should be divisible by nranks(%d)", + out_dims[0], nranks)); + + out_dims[0] = out_dims[0] / nranks; + out->mutable_data(out_dims, place); + + uint64_t recv_numel = in->numel() / nranks; + + void* inputPtr = reinterpret_cast(const_cast(in->data())); + void* outputPtr = reinterpret_cast(out->data()); + HcclDataType dtype = platform::ToHCCLDataType(in->type()); + + aclrtStream stream = nullptr; + if (ctx.Attr("use_calc_stream")) { + auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); + stream = static_cast(dev_ctx)->stream(); + } else { + stream = comm->stream(); + } + VLOG(3) << "begin hccl reduce scatter, parameter is: " + << "recv_numel: " << recv_numel << "dtype: " << dtype + << "hccl_red_type: " << HCCL_REDUCE_SUM << ", group is: " << group; + + PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclReduceScatter( + inputPtr, outputPtr, recv_numel, dtype, HCCL_REDUCE_SUM, comm->comm(), + reinterpret_cast(stream))); +#else + PADDLE_THROW(platform::errors::PreconditionNotMet( + "PaddlePaddle should compile with NPU.")); +#endif + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_NPU_KERNEL(c_reducescatter, + ops::CReduceScatterOpAscendKernel, + ops::CReduceScatterOpAscendKernel, + ops::CReduceScatterOpAscendKernel, + ops::CReduceScatterOpAscendKernel); diff --git a/paddle/fluid/operators/collective/c_reducescatter_op_npu_test.cc b/paddle/fluid/operators/collective/c_reducescatter_op_npu_test.cc new file mode 100644 index 0000000000000..f82f050a7206f --- /dev/null +++ b/paddle/fluid/operators/collective/c_reducescatter_op_npu_test.cc @@ -0,0 +1,189 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifndef _WIN32 +#include +#endif + +#include +#include +#include // NOLINT +#include + +#include "gtest/gtest.h" + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/operators/dropout_op.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/string/printf.h" + +#include "paddle/fluid/operators/collective/c_allgather_op.h" +#include "paddle/fluid/operators/collective/c_allreduce_op.h" +#include "paddle/fluid/operators/collective/c_broadcast_op.h" +#include "paddle/fluid/operators/collective/c_reducescatter_op.h" +#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h" + +#if defined(PADDLE_WITH_ASCEND_CL) +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/hccl_helper.h" +#endif + +namespace f = paddle::framework; +namespace p = paddle::platform; +namespace m = paddle::operators::math; + +USE_OP(c_reducescatter); +USE_NO_KERNEL_OP(c_gen_hccl_id); +USE_NO_KERNEL_OP(c_comm_init_hccl); +USE_OP_DEVICE_KERNEL(c_reducescatter, NPU); + +DECLARE_string(selected_npus); + +template +void PrintDebugInfo(const std::string preStr, const std::vector& data) { + std::string debugstring = ""; + for (auto ele : data) { + debugstring += std::to_string(ele) + std::string(","); + } + VLOG(2) << preStr << ":" << std::endl << debugstring; +} + +void PrepareUniqueId(f::Scope* scope, const p::DeviceContext& ctx, + HcclRootInfo* hccl_id) { + int rank_id = atoi(getenv("RANK_ID")); + int device_id = atoi(getenv("DEVICE_ID")); + + VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id + << "; rank_id = " << rank_id + << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID")); + + std::vector rank_ids{0, 1}; + f::AttributeMap gen_hccl_id; + + std::vector endpointList = {"127.0.0.1:6175", "127.0.0.1:6177"}; + gen_hccl_id["rank"] = rank_id; + gen_hccl_id["endpoint"] = endpointList[rank_id]; + std::vector other_endpoints = { + endpointList[rank_id == 0 ? 
1 : 0]}; + gen_hccl_id["other_endpoints"] = other_endpoints; + + auto out = scope->Var("Out"); + auto id = out->GetMutable(); + + VLOG(3) << "break"; + + auto comm_init_op = f::OpRegistry::CreateOp("c_gen_hccl_id", {}, + {{"Out", {"Out"}}}, gen_hccl_id); + VLOG(3) << "break"; + auto place = ctx.GetPlace(); + comm_init_op->Run(*scope, place); + ctx.Wait(); + + memcpy(hccl_id, id, 1024); +} + +void Prepare(f::Scope* scope, const p::DeviceContext& ctx, + HcclRootInfo* hccl_id) { + auto x = scope->Var("X"); + auto id = x->GetMutable(); + + memcpy(id, hccl_id, 1024); + + int rank_id = atoi(getenv("RANK_ID")); + int device_id = atoi(getenv("DEVICE_ID")); + + VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id + << "; rank_id = " << rank_id + << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID")); + + // std::vector rank_ids{0, 1}; + f::AttributeMap comm_init_attrs; + comm_init_attrs["ring_id"] = 0; + comm_init_attrs["rank_ids"] = 2; + comm_init_attrs["rank"] = rank_id; + comm_init_attrs["device_id"] = device_id; + // comm_init_attrs["rank_ids"] = rank_ids; + auto comm_init_op = f::OpRegistry::CreateOp( + "c_comm_init_hccl", {{"X", {"X"}}}, {}, comm_init_attrs); + auto place = ctx.GetPlace(); + comm_init_op->Run(*scope, place); + ctx.Wait(); +} + +void TestHCCLReduceScatterOp(f::Scope* scope, const p::DeviceContext& ctx) { + // init + auto x = scope->Var("Data"); + auto tensor_x = x->GetMutable(); + + std::vector init; + int num1 = 4; + int num2 = 1; + + for (int64_t i = 0; i < num1 * num2; ++i) { + init.push_back(1.0); + } + PrintDebugInfo("input data", init); + + TensorFromVector(init, ctx, tensor_x); + tensor_x->Resize({num1, num2}); + + ctx.Wait(); + + auto place = ctx.GetPlace(); + auto out = scope->Var("OutData"); + auto tensor_out = out->GetMutable(); + tensor_out->Resize({num1, num2}); + tensor_out->mutable_data(place); // allocate + + ctx.Wait(); + + // run + f::AttributeMap attrs; + attrs["tag"] = std::string("tagx"); + attrs["ring_id"] = 0; + attrs["nranks"] = 2; + + auto op = f::OpRegistry::CreateOp("c_reducescatter", {{"X", {"Data"}}}, + {{"Out", {"OutData"}}}, attrs); + + int iter_num = 10; + for (int i = 0; i < iter_num; i++) { + op->Run(*scope, place); + ctx.Wait(); + } + + std::vector out_vec; + TensorToVector(*tensor_out, ctx, &out_vec); + ctx.Wait(); + + PrintDebugInfo("output data", out_vec); + EXPECT_EQ(out_vec.size(), init.size() / 2); + for (uint32_t i = 0; i < out_vec.size(); i++) { + EXPECT_EQ(out_vec[i], 2.0); + } +} + +TEST(c_reducescatter, NPU) { + f::Scope scope; + HcclRootInfo hccl_id; + + // only support one device, if more than one device, use first default + p::NPUDeviceContext ctx(p::NPUPlace(atoi(FLAGS_selected_npus.c_str()))); + + PrepareUniqueId(&scope, ctx, &hccl_id); + Prepare(&scope, ctx, &hccl_id); + TestHCCLReduceScatterOp(&scope, ctx); +} diff --git a/paddle/fluid/operators/collective/c_split_op.cc b/paddle/fluid/operators/collective/c_split_op.cc new file mode 100644 index 0000000000000..03046d571d0f0 --- /dev/null +++ b/paddle/fluid/operators/collective/c_split_op.cc @@ -0,0 +1,112 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/collective/c_split_op.h" + +namespace paddle { +namespace operators { + +class CSplitOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "c_split"); + OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "c_split"); + int nranks = ctx->Attrs().Get("nranks"); + int rank = ctx->Attrs().Get("rank"); + int ring_id = ctx->Attrs().Get("ring_id"); + PADDLE_ENFORCE_GE(nranks, 2, platform::errors::InvalidArgument( + "The number of ranks (%d) for c_split " + "must be greater than 1.", + nranks)); + PADDLE_ENFORCE_GE( + ring_id, 0, + platform::errors::InvalidArgument( + "The ring_id (%d) for c_split must be non-negative.", ring_id)); + PADDLE_ENFORCE_GE( + rank, 0, platform::errors::InvalidArgument( + "The rank (%d) for c_split must be non-negative.", rank)); + PADDLE_ENFORCE_LT(rank, nranks, + platform::errors::InvalidArgument( + "The value of rank (%d) for c_split must " + "be less than that of nranks.", + rank, nranks)); + + framework::DDim dim = ctx->GetInputDim("X"); + dim[dim.size() - 1] = dim[dim.size() - 1] / nranks; + if (dim[0] < 0) dim[0] = -1; + ctx->SetOutputDim("Out", dim); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace()); + } +}; + +template +class CSplitOpGradMaker : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + + protected: + void Apply(GradOpPtr retv) const override { + retv->SetType("c_allgather"); + retv->SetInput("X", this->OutputGrad("Out")); + retv->SetOutput("Out", this->InputGrad("X")); + retv->SetAttrMap(this->Attrs()); + } +}; + +class CSplitOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() { + AddInput("X", "(Tensor) tensor to be split."); + AddOutput("Out", "(Tensor) the result of split."); + AddAttr("rank", "(int default 0) rank id.").SetDefault(0); + AddAttr("nranks", "(int default 1) number of ranks.").SetDefault(1); + AddAttr("ring_id", "(int default 0) ring id.").SetDefault(0); + AddAttr( + "use_calc_stream", + "(bool default false) eject CUDA operations to calculation stream.") + .SetDefault(false); + AddAttr("use_model_parallel", + "(bool default false) use this op with model parallel.") + .SetDefault(true); + AddComment(R"DOC( +CSplit Operator +Split the tensor evenly according to its rank. 
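+
+For example, with nranks = 2 and an input of shape [N, H], the rank-th slice of
+shape [N, H / 2] along the last dimension is returned on each rank. The gradient
+of c_split is computed by the c_allgather operator (see CSplitOpGradMaker above).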
+)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OPERATOR(c_split, ops::CSplitOp, + ops::CSplitOpGradMaker, + ops::CSplitOpGradMaker, + ops::CSplitOpMaker); + +REGISTER_OP_CPU_KERNEL(c_split, ops::CSplitOpCPUKernel, + ops::CSplitOpCPUKernel, + ops::CSplitOpCPUKernel, + ops::CSplitOpCPUKernel, + ops::CSplitOpCPUKernel); diff --git a/paddle/fluid/operators/collective/c_split_op.cu.cc b/paddle/fluid/operators/collective/c_split_op.cu.cc new file mode 100644 index 0000000000000..92a7f5e41b1d2 --- /dev/null +++ b/paddle/fluid/operators/collective/c_split_op.cu.cc @@ -0,0 +1,80 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include + +#include "paddle/fluid/operators/collective/c_split_op.h" +#include "paddle/fluid/operators/math/concat_and_split.h" + +namespace paddle { +namespace operators { + +template +class CSplitOpCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto x = ctx.Input("X"); + auto out = ctx.Output("Out"); + + int nranks = ctx.Attr("nranks"); + int rank = ctx.Attr("rank"); + auto place = ctx.GetPlace(); + + PADDLE_ENFORCE_GE(rank, 0, platform::errors::PreconditionNotMet( + "The value of rank (%d) for c_split must be " + "greater than or equal to 0.", + rank)); + PADDLE_ENFORCE_GE(nranks, 2, + platform::errors::PreconditionNotMet( + "The value of nranks (%d) for c_split must be " + "greater than or equal to 2.", + nranks)); + PADDLE_ENFORCE_LT(rank, nranks, + platform::errors::PreconditionNotMet( + "The value of rank (%d) for c_split must be " + "less than that of nranks (%d).", + rank, nranks)); + + auto& dev_ctx = ctx.template device_context(); + std::vector shape_refer; + std::vector results; + size_t numel = x->numel(); + auto dims = x->dims(); + numel /= nranks; + int axis = dims.size() - 1; + dims[dims.size() - 1] /= nranks; + for (int i = 0; i < nranks; i++) { + framework::Tensor* out = new framework::Tensor(); + out->mutable_data(dims, place); + shape_refer.emplace_back(out); + results.emplace_back(out); + } + + math::SplitFunctor functor; + functor(dev_ctx, *x, shape_refer, axis, &results); + out->mutable_data(dims, place); + paddle::framework::TensorCopySync(*results[rank], out->place(), out); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_CUDA_KERNEL(c_split, ops::CSplitOpCUDAKernel, + ops::CSplitOpCUDAKernel, + ops::CSplitOpCUDAKernel, + ops::CSplitOpCUDAKernel, + ops::CSplitOpCUDAKernel); diff --git a/paddle/fluid/operators/collective/c_split_op.h b/paddle/fluid/operators/collective/c_split_op.h new file mode 100644 index 0000000000000..ea0c7fc45c66b --- /dev/null +++ b/paddle/fluid/operators/collective/c_split_op.h @@ -0,0 +1,38 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include + +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +template +class CSplitOpCPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + PADDLE_THROW(platform::errors::Unavailable( + "Do not support c_split for cpu kernel now.")); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/collective/c_sync_calc_stream_op.cc b/paddle/fluid/operators/collective/c_sync_calc_stream_op.cc index 700d1173e2ff6..83da712bee908 100644 --- a/paddle/fluid/operators/collective/c_sync_calc_stream_op.cc +++ b/paddle/fluid/operators/collective/c_sync_calc_stream_op.cc @@ -61,6 +61,16 @@ class CSyncCalcStreamCudaKernel : public framework::OpKernel { PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(dev_ctx->stream())); #endif +#elif defined(PADDLE_WITH_ASCEND_CL) && !defined(_WIN32) + auto place = ctx.GetPlace(); + PADDLE_ENFORCE_EQ(is_npu_place(place), true, + platform::errors::PreconditionNotMet( + "Sync stream op can run on npu place only for now.")); + + auto dev_ctx = static_cast( + platform::DeviceContextPool::Instance().Get(place)); + PADDLE_ENFORCE_NPU_SUCCESS(aclrtSynchronizeStream(dev_ctx->stream())); + #else PADDLE_THROW(platform::errors::PreconditionNotMet( "PaddlePaddle should compile with GPU.")); diff --git a/paddle/fluid/operators/collective/c_sync_calc_stream_op_npu_test.cc b/paddle/fluid/operators/collective/c_sync_calc_stream_op_npu_test.cc new file mode 100644 index 0000000000000..4b1f7bb340178 --- /dev/null +++ b/paddle/fluid/operators/collective/c_sync_calc_stream_op_npu_test.cc @@ -0,0 +1,107 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifndef _WIN32 +#include +#endif + +#include + +#include +#include // NOLINT +#include + +#include "gtest/gtest.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/string/printf.h" + +namespace f = paddle::framework; +namespace p = paddle::platform; +namespace m = paddle::operators::math; + +USE_OP(elementwise_add); +USE_OP_DEVICE_KERNEL(elementwise_add, NPU); +USE_NO_KERNEL_OP(c_sync_calc_stream); + +template +void Compare(f::Scope* scope, const p::DeviceContext& ctx) { + // init + auto x = scope->Var("X"); + auto tensor_x = x->GetMutable(); + + auto y = scope->Var("Y"); + auto tensor_y = y->GetMutable(); + + std::vector init_x; + for (int64_t i = 0; i < 10 * 10; ++i) { + init_x.push_back(static_cast(1.0)); + } + + std::vector init_y; + for (int64_t i = 0; i < 10 * 10; ++i) { + init_y.push_back(static_cast(2.0)); + } + + TensorFromVector(init_x, ctx, tensor_x); + tensor_x->Resize({10, 10}); + TensorFromVector(init_y, ctx, tensor_y); + tensor_y->Resize({10, 10}); + + f::AttributeMap attrs; + auto place = ctx.GetPlace(); + auto out = scope->Var("Out"); + auto tensor_out = out->GetMutable(); + + // sync data + auto sync_op0 = f::OpRegistry::CreateOp("c_sync_calc_stream", {{"X", {"X"}}}, + {{"Out", {"Out"}}}, attrs); + sync_op0->Run(*scope, place); + + // run + + auto op = + f::OpRegistry::CreateOp("elementwise_add", {{"X", {"X"}}, {"Y", {"Y"}}}, + {{"Out", {"Out"}}}, attrs); + + op->Run(*scope, place); + + // sync op run + auto sync_op = f::OpRegistry::CreateOp("c_sync_calc_stream", {{"X", {"X"}}}, + {{"Out", {"Out"}}}, attrs); + sync_op->Run(*scope, place); + + std::vector out_vec; + TensorToVector(*tensor_out, ctx, &out_vec); + + // sync op copy + auto sync_op2 = f::OpRegistry::CreateOp("c_sync_calc_stream", {{"X", {"X"}}}, + {{"Out", {"Out"}}}, attrs); + sync_op2->Run(*scope, place); + + float expected = 3.0; + + EXPECT_EQ(out_vec.size(), init_x.size()); + for (uint32_t i = 0; i < out_vec.size(); i++) { + EXPECT_EQ(out_vec[i], static_cast(expected)); + } +} + +TEST(c_sync_calc_stream, NPU_fp32) { + f::Scope scope; + p::NPUDeviceContext ctx(p::NPUPlace(0)); + Compare(&scope, ctx); +} diff --git a/paddle/fluid/operators/collective/c_sync_comm_stream_op.cc b/paddle/fluid/operators/collective/c_sync_comm_stream_op.cc index 95b9cd040fe94..e6f6bf5345619 100644 --- a/paddle/fluid/operators/collective/c_sync_comm_stream_op.cc +++ b/paddle/fluid/operators/collective/c_sync_comm_stream_op.cc @@ -19,6 +19,11 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/nccl_helper.h" #endif +#if defined(PADDLE_WITH_ASCEND_CL) +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/hccl_helper.h" +#endif + namespace paddle { namespace operators { @@ -56,9 +61,8 @@ template class CSyncCommStreamCudaKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) - auto place = ctx.GetPlace(); +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) int ring_id = ctx.Attr("ring_id"); auto stream = @@ -70,6 +74,16 @@ class CSyncCommStreamCudaKernel : public framework::OpKernel { PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream)); #endif +#elif defined(PADDLE_WITH_ASCEND_CL) + auto place = ctx.GetPlace(); + PADDLE_ENFORCE_EQ(is_npu_place(place), true, + platform::errors::PreconditionNotMet( + "Sync stream op can run on npu place only for now.")); + int ring_id = ctx.Attr("ring_id"); + auto stream = + platform::HCCLCommContext::Instance().Get(ring_id, place)->stream(); + PADDLE_ENFORCE_NPU_SUCCESS(aclrtSynchronizeStream(stream)); + #else PADDLE_THROW(platform::errors::PreconditionNotMet( "PaddlePaddle should compile with GPU.")); diff --git a/paddle/fluid/operators/collective/c_sync_comm_stream_op_npu_test.cc b/paddle/fluid/operators/collective/c_sync_comm_stream_op_npu_test.cc new file mode 100644 index 0000000000000..3915ec4fa35e8 --- /dev/null +++ b/paddle/fluid/operators/collective/c_sync_comm_stream_op_npu_test.cc @@ -0,0 +1,190 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifndef _WIN32 +#include +#endif + +#include + +#include +#include // NOLINT +#include + +#include "gtest/gtest.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/operators/dropout_op.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/string/printf.h" + +#include "paddle/fluid/operators/collective/c_broadcast_op.h" +#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h" + +#if defined(PADDLE_WITH_ASCEND_CL) +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/hccl_helper.h" +#endif + +namespace f = paddle::framework; +namespace p = paddle::platform; +namespace m = paddle::operators::math; + +USE_OP(c_broadcast); +USE_NO_KERNEL_OP(c_sync_comm_stream); +USE_NO_KERNEL_OP(c_gen_hccl_id); +USE_NO_KERNEL_OP(c_comm_init_hccl); +USE_OP_DEVICE_KERNEL(c_broadcast, NPU); + +DECLARE_string(selected_npus); + +template +void PrintDebugInfo(const std::string preStr, const std::vector& data) { + std::string debugstring = ""; + for (auto ele : data) { + debugstring += std::to_string(ele) + std::string(","); + } + VLOG(2) << preStr << ":" << std::endl << debugstring; +} + +void PrepareUniqueId(f::Scope* scope, const p::DeviceContext& ctx, + HcclRootInfo* hccl_id) { + int rank_id = atoi(getenv("RANK_ID")); + int device_id = atoi(getenv("DEVICE_ID")); + + VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id + << "; rank_id = " << rank_id + << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID")); + + std::vector rank_ids{0, 1}; + f::AttributeMap gen_hccl_id; + + std::vector endpointList = {"127.0.0.1:6175", "127.0.0.1:6177"}; + gen_hccl_id["rank"] = rank_id; + gen_hccl_id["endpoint"] = endpointList[rank_id]; + std::vector other_endpoints = { + endpointList[rank_id == 0 ? 
1 : 0]}; + gen_hccl_id["other_endpoints"] = other_endpoints; + + auto out = scope->Var("Out"); + auto id = out->GetMutable(); + + VLOG(3) << "break"; + + auto comm_init_op = f::OpRegistry::CreateOp("c_gen_hccl_id", {}, + {{"Out", {"Out"}}}, gen_hccl_id); + VLOG(3) << "break"; + auto place = ctx.GetPlace(); + comm_init_op->Run(*scope, place); + ctx.Wait(); + + memcpy(hccl_id, id, 1024); +} + +void Prepare(f::Scope* scope, const p::DeviceContext& ctx, + HcclRootInfo* hccl_id) { + auto x = scope->Var("X"); + auto id = x->GetMutable(); + + memcpy(id, hccl_id, 1024); + + int rank_id = atoi(getenv("RANK_ID")); + int device_id = atoi(getenv("DEVICE_ID")); + + VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id + << "; rank_id = " << rank_id + << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID")); + + // std::vector rank_ids{0, 1}; + f::AttributeMap comm_init_attrs; + comm_init_attrs["ring_id"] = 0; + comm_init_attrs["rank_ids"] = 2; + comm_init_attrs["rank"] = rank_id; + comm_init_attrs["device_id"] = device_id; + // comm_init_attrs["rank_ids"] = rank_ids; + auto comm_init_op = f::OpRegistry::CreateOp( + "c_comm_init_hccl", {{"X", {"X"}}}, {}, comm_init_attrs); + auto place = ctx.GetPlace(); + comm_init_op->Run(*scope, place); + ctx.Wait(); +} + +void TestHCCLBroadcastOp(f::Scope* scope, const p::DeviceContext& ctx) { + std::cout << "BEGIN TEST:" << __FUNCTION__ << std::endl; + // init + auto x = scope->Var("Data"); + auto tensor_x = x->GetMutable(); + int num = 2; + std::vector init; + int rank_id = atoi(getenv("RANK_ID")); + std::cout << "rank_id:" << rank_id << std::endl; + for (int64_t i = 0; i < num * num; ++i) { + init.push_back(1.0 + rank_id); + std::cout << init[0]; + } + std::cout << std::endl; + + TensorFromVector(init, ctx, tensor_x); + tensor_x->Resize({num, num}); + + ctx.Wait(); + + auto place = ctx.GetPlace(); + auto out = scope->Var("OutData"); + auto tensor_out = out->GetMutable(); + tensor_out->Resize({num, num}); + tensor_out->mutable_data(place); // allocate + + ctx.Wait(); + + // run + f::AttributeMap attrs; + attrs["tag"] = std::string("tagx"); + attrs["root"] = 0; + attrs["ring_id"] = 0; + + auto op = f::OpRegistry::CreateOp("c_broadcast", {{"X", {"Data"}}}, + {{"Out", {"OutData"}}}, attrs); + + op->Run(*scope, place); + + // comm sync + + auto sync_op = f::OpRegistry::CreateOp( + "c_sync_comm_stream", {{"X", {"Data"}}}, {{"Out", {"OutData"}}}, attrs); + sync_op->Run(*scope, place); + + // ctx.Wait(); + + std::vector out_vec; + TensorToVector(*tensor_out, ctx, &out_vec); + + EXPECT_EQ(out_vec.size(), init.size()); + for (uint32_t i = 0; i < out_vec.size(); i++) { + EXPECT_EQ(out_vec[i], 1.0); + } +} + +TEST(c_sync_comm_stream_op, NPU) { + f::Scope scope; + HcclRootInfo hccl_id; + + // only support one device, if more than one device, use first default + p::NPUDeviceContext ctx(p::NPUPlace(atoi(FLAGS_selected_npus.c_str()))); + + PrepareUniqueId(&scope, ctx, &hccl_id); + Prepare(&scope, ctx, &hccl_id); + TestHCCLBroadcastOp(&scope, ctx); +} diff --git a/paddle/fluid/operators/collective/gen_hccl_id_op.cc b/paddle/fluid/operators/collective/gen_hccl_id_op.cc new file mode 100644 index 0000000000000..0cb2dd188725f --- /dev/null +++ b/paddle/fluid/operators/collective/gen_hccl_id_op.cc @@ -0,0 +1,216 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include + +#include "glog/logging.h" +#include "paddle/fluid/framework/op_proto_maker.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/var_type_traits.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/hccl_helper.h" +#include "paddle/fluid/platform/place.h" +#include "paddle/fluid/string/split.h" + +#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h" + +namespace paddle { +namespace operators { + +#ifdef PADDLE_WITH_ASCEND_CL + +class GenHCCLIdOp : public framework::OperatorBase { + public: + GenHCCLIdOp(const std::string& type, const framework::VariableNameMap& inputs, + const framework::VariableNameMap& outputs, + const framework::AttributeMap& attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + + void RunImpl(const framework::Scope& scope, + const platform::Place& dev_place) const override { + std::vector trainers = + Attr>("trainers"); + int trainer_id = Attr("trainer_id"); + std::string endpoint = trainers[trainer_id]; + + PADDLE_ENFORCE_GE(trainer_id, 0, platform::errors::InvalidArgument( + "trainer_id %d is less than 0. Its " + "valid range is [0, trainer_size)")); + PADDLE_ENFORCE_LT( + trainer_id, static_cast(trainers.size()), + platform::errors::OutOfRange("trainer_id %d is out of range. Its valid " + "range is [0, trainer_size)", + trainer_id)); + + int hccl_comm_num = Attr("hccl_comm_num"); + int use_hierarchical_allreduce = Attr("use_hierarchical_allreduce"); + int inter_nranks = Attr("hierarchical_allreduce_inter_nranks"); + int inter_trainer_id = -1; + int exter_trainer_id = -1; + + if (use_hierarchical_allreduce) { + PADDLE_ENFORCE_GT( + trainers.size(), 1, + platform::errors::PreconditionNotMet( + "The number of collective trainers %llu <= 1", trainers.size())); + PADDLE_ENFORCE_GT( + inter_nranks, 1, + platform::errors::PreconditionNotMet( + "inter_nranks %d <= 1 while in hierarchical allreduce mode", + inter_nranks)); + PADDLE_ENFORCE_EQ( + trainers.size() % inter_nranks, 0, + platform::errors::PreconditionNotMet( + "The number of trainers %llu mod inter_nranks %d is not equal 0", + trainers.size(), inter_nranks)); + + inter_trainer_id = trainer_id % inter_nranks; + + if (trainer_id % inter_nranks == 0) { + exter_trainer_id = trainer_id / inter_nranks; + } + } + + std::ostringstream ss; + for (size_t i = 0; i < trainers.size(); i++) { + ss << trainers[i] << ","; + } + + VLOG(1) << "trainer_id:" << trainer_id + << ", use_hierarchical_allreduce:" << use_hierarchical_allreduce + << ", hccl_comm_num:" << hccl_comm_num + << ", inter_nranks:" << inter_nranks + << ", inter_trainer_id:" << inter_trainer_id + << ", exter_trainer_id:" << exter_trainer_id + << ", trainers:" << ss.str(); + + int server_fd = -1; + + /// 1. 
init flat + std::function func = platform::GetFlatHCCLVarName; + if (trainer_id == 0) { + // server endpoints + std::vector flat_endpoints; + flat_endpoints.insert(flat_endpoints.begin(), trainers.begin() + 1, + trainers.end()); + SendBroadCastHCCLID(flat_endpoints, hccl_comm_num, func, scope); + } else { + server_fd = CreateListenSocket(endpoint); + RecvBroadCastHCCLID(server_fd, endpoint, hccl_comm_num, func, scope); + } + + /// 2. hierarchical inter ncclid + func = platform::GetHierarchicalInterHCCLVarName; + if (inter_trainer_id == 0) { + std::ostringstream ss; + ss << endpoint; + std::vector inter_endpoints; + for (int i = trainer_id + 1; i < trainer_id + inter_nranks && + i < static_cast(trainers.size()); + i++) { + ss << ","; + inter_endpoints.push_back(trainers[i]); + ss << trainers[i]; + } + VLOG(1) << "Hierarchical inter ring endpoints:" << ss.str(); + + SendBroadCastHCCLID(inter_endpoints, hccl_comm_num, func, scope); + } else if (inter_trainer_id > 0) { + VLOG(1) << "Hierarchical inter ring"; + RecvBroadCastHCCLID(server_fd, endpoint, hccl_comm_num, func, scope); + } + + /// 3. hierarchical exter ncclid + func = platform::GetHierarchicalExterHCCLVarName; + if (exter_trainer_id == 0) { + std::ostringstream ss; + std::vector exter_endpoints; + ss << endpoint; + for (size_t i = inter_nranks; i < trainers.size(); i += inter_nranks) { + ss << ","; + exter_endpoints.push_back(trainers[i]); + ss << trainers[i]; + } + VLOG(1) << "Hierarchical exter ring endpoints:" << ss.str(); + + SendBroadCastHCCLID(exter_endpoints, hccl_comm_num, func, scope); + } else if (exter_trainer_id > 0) { + VLOG(1) << "Hierarchical exter ring"; + RecvBroadCastHCCLID(server_fd, endpoint, hccl_comm_num, func, scope); + } + + // close socket server + if (trainer_id != 0) { + CloseSocket(server_fd); + } + } +}; + +#else +class GenHCCLIdOp : public framework::OperatorBase { + public: + GenHCCLIdOp(const std::string& type, const framework::VariableNameMap& inputs, + const framework::VariableNameMap& outputs, + const framework::AttributeMap& attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + + void RunImpl(const framework::Scope& scope, + const platform::Place& dev_place) const override {} +}; + +#endif + +class GenHCCLIdOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddOutput("HCCLID", "Raw variable contains a HCCL UniqueId instaces."); + AddComment(R"DOC( +GenHCCLId operator + +For trainer 0: generate a new UniqueId and send it to all the other trainers. +For trainer 1~n: start a gRPC server to get the UniqueId, once got, stop the server. +)DOC"); + AddAttr>( + "trainers", + "['trainer0_ip:port', 'trainer1_ip:port', ...] 
" + "list of all trainer endpoints") + .SetDefault({}); + AddAttr("trainer_id", + "(int) " + "The index of the trainer in distributed training."); + AddAttr("hccl_comm_num", + "(int default 1) " + "The number of nccl communicator num.") + .SetDefault(1); + AddAttr("use_hierarchical_allreduce", + "(bool default false) " + "Wheter to use hierarchical allreduce.") + .SetDefault(false); + AddAttr("hierarchical_allreduce_inter_nranks", + "(int default 1) " + "Wheter to use hierarchical allreduce.") + .SetDefault(-1); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OPERATOR(gen_hccl_id, ops::GenHCCLIdOp, ops::GenHCCLIdOpMaker); diff --git a/paddle/fluid/operators/collective/gen_hccl_id_op_helper.cc b/paddle/fluid/operators/collective/gen_hccl_id_op_helper.cc new file mode 100644 index 0000000000000..15940a76f7110 --- /dev/null +++ b/paddle/fluid/operators/collective/gen_hccl_id_op_helper.cc @@ -0,0 +1,350 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h" +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "glog/logging.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/var_type_traits.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/string/split.h" + +#if defined(PADDLE_WITH_ASCEND_CL) +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/hccl_helper.h" +#endif + +namespace paddle { +namespace operators { + +constexpr char COMM_HEAD[] = "_pd_gen_comm_id_"; +#define HCCL_UNIQUE_ID_BYTES 1024 + +// Check system calls, such as socket, bind. 
+#define CHECK_SYS_CALL(call, name) \ + do { \ + int retval; \ + CHECK_SYS_CALL_VAL(call, name, retval); \ + } while (false) + +#define CHECK_SYS_CALL_VAL(call, name, retval) \ + do { \ + RETRY_SYS_CALL_VAL(call, name, retval); \ + if (retval == -1) { \ + PADDLE_THROW(platform::errors::Unavailable("Call to %s failed: %s", \ + name, strerror(errno))); \ + } \ + } while (false) + +#define RETRY_SYS_CALL_VAL(call, name, retval) \ + do { \ + retval = (call); \ + if (retval == -1 && \ + (errno == EINTR || errno == EWOULDBLOCK || errno == EAGAIN)) { \ + LOG(WARNING) << "Call " << name << " returned " << strerror(errno) \ + << " retry"; \ + } else { \ + break; \ + } \ + } while (true) + +static int SocketSend(int fd, const char* buffer, int size) { + int offset = 0; + int bytes = 0; + while (offset < size) { + bytes = send(fd, buffer + offset, size - offset, 0); + if (bytes == -1) { + if (errno != EINTR && errno != EWOULDBLOCK && errno != EAGAIN) { + // send failed + return -1; + } else { + bytes = 0; + } + } + offset += bytes; + } + return offset; +} + +static int SocketRecv(int fd, char* buffer, int size) { + int offset = 0; + int bytes = 0; + while (offset < size) { + bytes = recv(fd, buffer + offset, size - offset, 0); + if (bytes == 0) { + // closed by client, maybe probing alive client + return 0; + } + if (bytes == -1) { + if (errno != EINTR && errno != EWOULDBLOCK && errno != EAGAIN) { + return -1; + } else { + bytes = 0; + } + } + offset += bytes; + } + return offset; +} + +static void BindOrConnectFailed(int timeout, int* try_times, int* total_time, + const char* op, const std::string& ep) { + PADDLE_ENFORCE_LT( + *total_time, timeout, + platform::errors::Unavailable("%s addr=%s timeout, failed reason: %s", op, + ep.c_str(), strerror(errno))); + ++(*try_times); + int retry_time = std::min(*try_times * 500, 3000); // max 3 seconds + *total_time += retry_time; + + LOG(WARNING) << op << " addr=" << ep << " failed " << *try_times + << " times with reason: " << strerror(errno) << " retry after " + << retry_time / 1000.0 << " seconds"; + std::this_thread::sleep_for(std::chrono::milliseconds(retry_time)); +} + +int CreateListenSocket(const std::string& ep) { + auto addr = paddle::string::Split(ep, ':'); + PADDLE_ENFORCE_EQ( + addr.size(), 2UL, + platform::errors::InvalidArgument( + "The endpoint should contain host and port, but got %s.", ep)); + std::string host = addr[0]; + int port = std::stoi(addr[1]); + + // creating socket fd + int server_fd = -1; + CHECK_SYS_CALL_VAL(socket(AF_INET, SOCK_STREAM, 0), "socket", server_fd); + + // NOTE. Solutions to `Address already in use`. + // 1. Reuse addr&port. Otherwise, once the server closes the socket + // before client, the server will enter TIME-WAIT status. If we bind port + // again, the error `Address already in use` will appear. + // 2. Or we can close the client first to ensure that the server does + // not enter the TIME-WAIT state. But this is obviously not as convenient + // as the reuse method. 
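+ // Bind failures that survive the retriable-errno loop are handled further
+ // down: BindOrConnectFailed() sleeps with a growing backoff (at most 3
+ // seconds per retry) and throws once the accumulated wait exceeds the
+ // 900-second timeout.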
+ int opt = 1; +#if defined(SO_REUSEPORT) + // since Linux kernel 3.9 + CHECK_SYS_CALL(setsockopt(server_fd, SOL_SOCKET, SO_REUSEADDR | SO_REUSEPORT, + &opt, sizeof(opt)), + "setsockopt"); +#else + CHECK_SYS_CALL( + setsockopt(server_fd, SOL_SOCKET, SO_REUSEADDR, &opt, sizeof(opt)), + "setsockopt"); +#endif + + struct sockaddr_in address; + address.sin_family = AF_INET; + address.sin_addr.s_addr = INADDR_ANY; + address.sin_port = htons(port); + + // TODO(wangxi) Set from env, default 900s=15min + int timeout = 900 * 1000; + int try_times = 0; + int total_time = 0; + while (true) { + int ret_val = -1; + RETRY_SYS_CALL_VAL( + bind(server_fd, (struct sockaddr*)&address, sizeof(address)), "bind", + ret_val); + + if (ret_val == -1) { + BindOrConnectFailed(timeout, &try_times, &total_time, "bind", ep); + continue; + } + break; + } + + CHECK_SYS_CALL(listen(server_fd, 3), "listen"); + LOG(INFO) << "Server listening on: " << ep << " successful."; + return server_fd; +} + +void CloseSocket(int fd) { CHECK_SYS_CALL(close(fd), "close"); } + +static int SocketAccept(int server_fd, const char* head) { + struct sockaddr_in client_addr; + socklen_t addr_length = sizeof(client_addr); + char buffer[1024] = {0}; + int conn = -1; + + while (true) { + CHECK_SYS_CALL_VAL( + accept(server_fd, reinterpret_cast(&client_addr), + &addr_length), + "accept", conn); + + int ret_val = SocketRecv(conn, buffer, strlen(head)); + if (ret_val > 0 && strncmp(buffer, head, strlen(head)) == 0) { + break; // accept client + } else { + VLOG(3) << "socket read failed with ret_val=" << ret_val; + CloseSocket(conn); + } + } + return conn; +} + +static int ConnectAddr(const std::string& ep, const char* head) { + auto addr = paddle::string::Split(ep, ':'); + PADDLE_ENFORCE_EQ( + addr.size(), 2UL, + platform::errors::InvalidArgument( + "The endpoint should contain host and port, but got %s.", ep)); + std::string host = addr[0]; + int port = std::stoi(addr[1]); + + int sock = -1; + CHECK_SYS_CALL_VAL(socket(AF_INET, SOCK_STREAM, 0), "socket", sock); + + struct sockaddr_in server_addr; + memset(&server_addr, 0, sizeof(server_addr)); + server_addr.sin_family = AF_INET; + server_addr.sin_port = htons(port); + + char* ip = NULL; + struct hostent* hp = NULL; + hp = gethostbyname(host.c_str()); + PADDLE_ENFORCE_NOT_NULL(hp, platform::errors::InvalidArgument( + "Fail to get host by name %s.", host)); + + int i = 0; + while (hp->h_addr_list[i] != NULL) { + ip = inet_ntoa(*(struct in_addr*)hp->h_addr_list[i]); + VLOG(3) << "gethostbyname host:" << host << " ->ip: " << ip; + break; + } + + PADDLE_ENFORCE_GT(inet_pton(AF_INET, ip, &server_addr.sin_addr), 0, + platform::errors::Unavailable("Open address %s failed: %s", + ep, strerror(errno))); + + // TODO(wangxi) Set from env, default 900s=15min + int timeout = 900 * 1000; + int try_times = 0; + int total_time = 0; + while (true) { + int ret_val = -1; + RETRY_SYS_CALL_VAL( + connect(sock, (struct sockaddr*)&server_addr, sizeof(server_addr)), + "connect", ret_val); + + if (ret_val == -1) { + BindOrConnectFailed(timeout, &try_times, &total_time, "connect", ep); + continue; + } + + CHECK_SYS_CALL(SocketSend(sock, head, strlen(head)), "send"); + break; + } + return sock; +} + +static void RecvHCCLID(int conn, HcclRootInfo* hccl_id) { + char buffer[1024] = {0}; + static_assert(HCCL_UNIQUE_ID_BYTES <= 1024, + "hccl id bytes must <= buffer size"); + + CHECK_SYS_CALL(SocketRecv(conn, buffer, HCCL_UNIQUE_ID_BYTES), + "recv hccl id"); + memcpy(hccl_id, buffer, HCCL_UNIQUE_ID_BYTES); +} + +static void 
SendHCCLID(int conn, HcclRootInfo* hccl_id) { + char buffer[1024] = {0}; + memcpy(buffer, hccl_id, HCCL_UNIQUE_ID_BYTES); + + CHECK_SYS_CALL(SocketSend(conn, buffer, HCCL_UNIQUE_ID_BYTES), + "send hccl id"); +} + +void SendBroadCastHCCLID(std::vector servers, int hccl_comm_num, + std::function func, + const framework::Scope& scope) { + // connect with server + std::vector connects; + for (auto server : servers) { + VLOG(3) << "connecting endpoint: " << server; + int conn = ConnectAddr(server, COMM_HEAD); + connects.push_back(conn); + } + VLOG(3) << "connecting completed..."; + + for (int i = 0; i < hccl_comm_num; ++i) { + std::string var_name = func(i); + auto var = scope.FindVar(var_name); + PADDLE_ENFORCE_NOT_NULL( + var, platform::errors::NotFound("Variable with name %s is not found", + var_name.c_str())); + auto hccl_id = var->GetMutable(); + PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclGetRootInfo(hccl_id)); + + int j = 0; + for (auto conn : connects) { + VLOG(3) << "sending hccl_id_var: " << var_name << " to " << servers[j] + << " hccl_comm_no: " << i; + SendHCCLID(conn, hccl_id); + ++j; + } + VLOG(3) << "sending completed..."; + } + + // close client + for (auto conn : connects) { + CloseSocket(conn); + } +} + +void RecvBroadCastHCCLID(std::string endpoint, int hccl_comm_num, + std::function func, + const framework::Scope& scope) { + int server = CreateListenSocket(endpoint); + RecvBroadCastHCCLID(server, endpoint, hccl_comm_num, func, scope); + CloseSocket(server); +} + +void RecvBroadCastHCCLID(int server_fd, std::string endpoint, int hccl_comm_num, + std::function func, + const framework::Scope& scope) { + int client = SocketAccept(server_fd, COMM_HEAD); + + for (int i = 0; i < hccl_comm_num; ++i) { + std::string var_name = func(i); + auto var = scope.FindVar(var_name); + PADDLE_ENFORCE_NOT_NULL( + var, platform::errors::NotFound("Variable with name %s is not found", + var_name.c_str())); + auto hccl_id = var->GetMutable(); + + VLOG(3) << "trainer: " << endpoint << " receiving hccl_id_var: " << var_name + << " from trainer 0, hccl_comm_no: " << i; + RecvHCCLID(client, hccl_id); + } + VLOG(3) << "receiving completed..."; + CloseSocket(client); +} + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/collective/gen_hccl_id_op_helper.h b/paddle/fluid/operators/collective/gen_hccl_id_op_helper.h new file mode 100644 index 0000000000000..1ad6f791e1fc3 --- /dev/null +++ b/paddle/fluid/operators/collective/gen_hccl_id_op_helper.h @@ -0,0 +1,48 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include +#include +#include + +namespace paddle { +namespace framework { +class Scope; +} // namespace framework +} // namespace paddle + +namespace paddle { +namespace operators { + +int CreateListenSocket(const std::string& ep); + +void CloseSocket(int fd); + +void SendBroadCastHCCLID(std::vector servers, int nccl_comm_num, + std::function func, + const framework::Scope& scope); + +// server listen on endpoint, then recv nccl id +void RecvBroadCastHCCLID(std::string endpoint, int nccl_comm_num, + std::function func, + const framework::Scope& scope); + +// recv nccl id from socket +void RecvBroadCastHCCLID(int server_fd, std::string endpoint, int nccl_comm_num, + std::function func, + const framework::Scope& scope); +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/collective/recv_v2_op.cc b/paddle/fluid/operators/collective/recv_v2_op.cc index 0ae7b821617f9..39a9ed0c74ef5 100644 --- a/paddle/fluid/operators/collective/recv_v2_op.cc +++ b/paddle/fluid/operators/collective/recv_v2_op.cc @@ -70,6 +70,12 @@ class RecvOpV2Maker : public framework::OpProtoAndCheckerMaker { AddAttr("peer", "(int default 0) rank id for sender.").SetDefault(0); AddAttr("dtype", "(int default 5('float32')) data type of tensor.") .SetDefault(5); +#if defined(PADDLE_WITH_ASCEND_CL) + AddAttr("tag", "(string default tag) tag for broadcasting.") + .SetDefault("tag"); + AddAttr("srTag", "(string default tag) tag for broadcasting.") + .SetDefault(0); +#endif AddAttr>("out_shape", "shape of the output tensor.") .SetDefault(std::vector()); AddAttr( diff --git a/paddle/fluid/operators/collective/recv_v2_op_npu.cc b/paddle/fluid/operators/collective/recv_v2_op_npu.cc new file mode 100644 index 0000000000000..69f1f4681a33d --- /dev/null +++ b/paddle/fluid/operators/collective/recv_v2_op_npu.cc @@ -0,0 +1,79 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/collective/recv_v2_op.h" + +#if defined(PADDLE_WITH_ASCEND_CL) +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/hccl_helper.h" +#endif + +namespace paddle { +namespace operators { + +template +class CRecvOpASCENDKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { +#if defined(PADDLE_WITH_ASCEND_CL) + auto x = ctx.Output("Out"); + void* ptr = reinterpret_cast(const_cast(x->data())); + int numel = x->numel(); + HcclDataType dtype = platform::ToHCCLDataType(x->type()); + + int ring_id = ctx.Attr("ring_id"); + auto place = ctx.GetPlace(); + auto comm = + paddle::platform::HCCLCommContext::Instance().Get(ring_id, place); + + aclrtStream stream = nullptr; + auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); + if (ctx.Attr("use_calc_stream")) { + stream = static_cast(dev_ctx)->stream(); + } else { + stream = comm->stream(); + } + + int nranks = comm->nranks(); + int peer = ctx.Attr("peer"); + + PADDLE_ENFORCE_EQ(nranks, 2, platform::errors::InvalidArgument( + "The nranks must be 2, but (%d)", nranks)); + + int root = peer; + + VLOG(3) << "begin hccl recv, parameter is: " + << "root " << root << ", comm: " << comm->comm() + << ", stream: " << stream; + + PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclBroadcast( + ptr, numel, dtype, (uint32_t)root, comm->comm(), stream)); + +#else + PADDLE_THROW(platform::errors::PreconditionNotMet( + "PaddlePaddle should compile with NPU.")); +#endif + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_NPU_KERNEL(recv_v2, ops::CRecvOpASCENDKernel, + ops::CRecvOpASCENDKernel, + ops::CRecvOpASCENDKernel, + ops::CRecvOpASCENDKernel); diff --git a/paddle/fluid/operators/collective/recv_v2_op_npu_test.cc b/paddle/fluid/operators/collective/recv_v2_op_npu_test.cc new file mode 100644 index 0000000000000..384dfd1fc5f2d --- /dev/null +++ b/paddle/fluid/operators/collective/recv_v2_op_npu_test.cc @@ -0,0 +1,165 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifndef _WIN32 +#include +#endif + +#include +#include +#include // NOLINT +#include + +#include "gtest/gtest.h" + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/operators/dropout_op.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/string/printf.h" + +#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h" +#include "paddle/fluid/operators/collective/recv_v2_op.h" + +#if defined(PADDLE_WITH_ASCEND_CL) +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/hccl_helper.h" +#endif + +namespace f = paddle::framework; +namespace p = paddle::platform; +namespace m = paddle::operators::math; + +USE_OP(recv_v2); +USE_NO_KERNEL_OP(c_gen_hccl_id); +USE_NO_KERNEL_OP(c_comm_init_hccl); +USE_OP_DEVICE_KERNEL(recv_v2, NPU); + +void PrepareUniqueId(f::Scope* scope, const p::DeviceContext& ctx, + HcclRootInfo* hccl_id) { + int rank_id = atoi(getenv("RANK_ID")); + int device_id = atoi(getenv("DEVICE_ID")); + + VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id + << "; rank_id = " << rank_id + << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID")); + + std::vector rank_ids{0, 1}; + f::AttributeMap gen_hccl_id; + + std::vector endpointList = {"127.0.0.1:6175", "127.0.0.1:6177"}; + gen_hccl_id["rank"] = rank_id; + gen_hccl_id["endpoint"] = endpointList[rank_id]; + std::vector other_endpoints = { + endpointList[rank_id == 0 ? 1 : 0]}; + gen_hccl_id["other_endpoints"] = other_endpoints; + + auto out = scope->Var("Out"); + auto id = out->GetMutable(); + + VLOG(3) << "break"; + + auto comm_init_op = f::OpRegistry::CreateOp("c_gen_hccl_id", {}, + {{"Out", {"Out"}}}, gen_hccl_id); + VLOG(3) << "break"; + auto place = ctx.GetPlace(); + comm_init_op->Run(*scope, place); + ctx.Wait(); + + memcpy(hccl_id, id, 1024); +} + +void Prepare(f::Scope* scope, const p::DeviceContext& ctx, + HcclRootInfo* hccl_id) { + auto x = scope->Var("X"); + auto id = x->GetMutable(); + + memcpy(id, hccl_id, 1024); + + int rank_id = atoi(getenv("RANK_ID")); + int device_id = atoi(getenv("DEVICE_ID")); + + VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id + << "; rank_id = " << rank_id + << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID")); + + // std::vector rank_ids{0, 1}; + f::AttributeMap comm_init_attrs; + comm_init_attrs["ring_id"] = 0; + comm_init_attrs["rank_ids"] = 2; + comm_init_attrs["rank"] = rank_id; + comm_init_attrs["device_id"] = device_id; + // comm_init_attrs["rank_ids"] = rank_ids; + auto comm_init_op = f::OpRegistry::CreateOp( + "c_comm_init_hccl", {{"X", {"X"}}}, {}, comm_init_attrs); + auto place = ctx.GetPlace(); + comm_init_op->Run(*scope, place); + ctx.Wait(); +} + +void TestHcomRecvOp(f::Scope* scope, const p::DeviceContext& ctx) { + std::cout << "BEGIN TEST:" << __FUNCTION__ << std::endl; + + int num = atoi(getenv("DATA_SIZE")); + EXPECT_GT(num, 0); + EXPECT_LT(num, 1 << 15); + int rank_id = atoi(getenv("RANK_ID")); + VLOG(3) << "rank_id:" << rank_id << std::endl; + + ctx.Wait(); + auto place = ctx.GetPlace(); + auto out = scope->Var("Data"); + auto tensor_out = out->GetMutable(); + tensor_out->Resize({num, num}); + tensor_out->mutable_data(place); // allocate + + ctx.Wait(); + + f::AttributeMap attrs; + attrs["tag"] = std::string("srtest"); + attrs["peer"] = atoi(getenv("SRC_RANK")); + attrs["ring_id"] = 0; + attrs["srTag"] = 0; + std::vector out_shape; + 
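+ // The receiver is told the tensor shape explicitly: the test fills the
+ // out_shape attribute ({num, num} here) to match the tensor resized above.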
out_shape.push_back(num); + out_shape.push_back(num); + attrs["out_shape"] = out_shape; + + auto op = f::OpRegistry::CreateOp("recv_v2", {}, {{"Out", {"Data"}}}, attrs); + VLOG(3) << "CreateOp recv_v2"; + + for (int i = 0; i < 10; i++) { + op->Run(*scope, place); + } + VLOG(3) << "Run op recv_v2"; + std::vector out_vec; + TensorToVector(*tensor_out, ctx, &out_vec); + ctx.Wait(); + std::vector init(num * num, 1.0 * atoi(getenv("DEST_RANK"))); + EXPECT_EQ(out_vec == init, true); +} + +TEST(recv_v2, NPU) { + f::Scope scope; + HcclRootInfo hccl_id; + + char* npu_id = getenv("FLAGS_selected_npus"); + VLOG(3) << "Select npu:" << npu_id; + p::NPUDeviceContext ctx(p::NPUPlace(atoi(npu_id))); + + PrepareUniqueId(&scope, ctx, &hccl_id); + Prepare(&scope, ctx, &hccl_id); + TestHcomRecvOp(&scope, ctx); +} diff --git a/paddle/fluid/operators/collective/send_v2_op.cc b/paddle/fluid/operators/collective/send_v2_op.cc index c5a86b4f08813..c60d560e43bae 100644 --- a/paddle/fluid/operators/collective/send_v2_op.cc +++ b/paddle/fluid/operators/collective/send_v2_op.cc @@ -50,6 +50,12 @@ class SendOpV2Maker : public framework::OpProtoAndCheckerMaker { AddAttr("ring_id", "(int default 0) nccl communication ring id.") .SetDefault(0); AddAttr("peer", "(int default 0) rank id for receiver.").SetDefault(0); +#if defined(PADDLE_WITH_ASCEND_CL) + AddAttr("tag", "(string default tag) tag for broadcasting.") + .SetDefault("tag"); + AddAttr("srTag", "(string default tag) tag for broadcasting.") + .SetDefault(0); +#endif AddAttr( "use_calc_stream", "(bool default false) eject CUDA operations to calculation stream.") diff --git a/paddle/fluid/operators/collective/send_v2_op_npu.cc b/paddle/fluid/operators/collective/send_v2_op_npu.cc new file mode 100644 index 0000000000000..0ade090fcaac0 --- /dev/null +++ b/paddle/fluid/operators/collective/send_v2_op_npu.cc @@ -0,0 +1,79 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/collective/send_v2_op.h" + +#if defined(PADDLE_WITH_ASCEND_CL) +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/hccl_helper.h" +#endif + +namespace paddle { +namespace operators { + +template +class CSendOpASCENDKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { +#if defined(PADDLE_WITH_ASCEND_CL) + auto x = ctx.Input("X"); + void* ptr = reinterpret_cast(const_cast(x->data())); + int numel = x->numel(); + HcclDataType dtype = platform::ToHCCLDataType(x->type()); + + int ring_id = ctx.Attr("ring_id"); + auto place = ctx.GetPlace(); + auto comm = + paddle::platform::HCCLCommContext::Instance().Get(ring_id, place); + + aclrtStream stream = nullptr; + auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); + if (ctx.Attr("use_calc_stream")) { + stream = static_cast(dev_ctx)->stream(); + } else { + stream = comm->stream(); + } + + int nranks = comm->nranks(); + int rank = comm->rank(); + + PADDLE_ENFORCE_EQ(nranks, 2, platform::errors::InvalidArgument( + "The nranks must be 2, but (%d)", nranks)); + + int root = rank; + + VLOG(3) << "begin hccl send, parameter is: " + << "root " << root << ", comm: " << comm->comm() + << ", stream: " << stream; + + PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclBroadcast( + ptr, numel, dtype, (uint32_t)root, comm->comm(), stream)); + +#else + PADDLE_THROW(platform::errors::PreconditionNotMet( + "PaddlePaddle should compile with NPU.")); +#endif + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_NPU_KERNEL(send_v2, ops::CSendOpASCENDKernel, + ops::CSendOpASCENDKernel, + ops::CSendOpASCENDKernel, + ops::CSendOpASCENDKernel); diff --git a/paddle/fluid/operators/collective/send_v2_op_npu_test.cc b/paddle/fluid/operators/collective/send_v2_op_npu_test.cc new file mode 100644 index 0000000000000..cf01b1d0a6a1d --- /dev/null +++ b/paddle/fluid/operators/collective/send_v2_op_npu_test.cc @@ -0,0 +1,154 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifndef _WIN32 +#include +#endif + +#include +#include +#include // NOLINT +#include +#include "gtest/gtest.h" + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/operators/dropout_op.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/string/printf.h" + +#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h" +#include "paddle/fluid/operators/collective/send_v2_op.h" + +#if defined(PADDLE_WITH_ASCEND_CL) +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/hccl_helper.h" +#endif + +namespace f = paddle::framework; +namespace p = paddle::platform; +namespace m = paddle::operators::math; + +USE_OP(send_v2); +USE_NO_KERNEL_OP(c_gen_hccl_id); +USE_NO_KERNEL_OP(c_comm_init_hccl); +USE_OP_DEVICE_KERNEL(send_v2, NPU); + +void PrepareUniqueId(f::Scope* scope, const p::DeviceContext& ctx, + HcclRootInfo* hccl_id) { + int rank_id = atoi(getenv("RANK_ID")); + int device_id = atoi(getenv("DEVICE_ID")); + + VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id + << "; rank_id = " << rank_id + << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID")); + + std::vector rank_ids{0, 1}; + f::AttributeMap gen_hccl_id; + + std::vector endpointList = {"127.0.0.1:6175", "127.0.0.1:6177"}; + gen_hccl_id["rank"] = rank_id; + gen_hccl_id["endpoint"] = endpointList[rank_id]; + std::vector other_endpoints = { + endpointList[rank_id == 0 ? 1 : 0]}; + gen_hccl_id["other_endpoints"] = other_endpoints; + + auto out = scope->Var("Out"); + auto id = out->GetMutable(); + + VLOG(3) << "break"; + + auto comm_init_op = f::OpRegistry::CreateOp("c_gen_hccl_id", {}, + {{"Out", {"Out"}}}, gen_hccl_id); + VLOG(3) << "break"; + auto place = ctx.GetPlace(); + comm_init_op->Run(*scope, place); + ctx.Wait(); + + memcpy(hccl_id, id, 1024); +} + +void Prepare(f::Scope* scope, const p::DeviceContext& ctx, + HcclRootInfo* hccl_id) { + auto x = scope->Var("X"); + auto id = x->GetMutable(); + + memcpy(id, hccl_id, 1024); + + int rank_id = atoi(getenv("RANK_ID")); + int device_id = atoi(getenv("DEVICE_ID")); + + VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id + << "; rank_id = " << rank_id + << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID")); + + // std::vector rank_ids{0, 1}; + f::AttributeMap comm_init_attrs; + comm_init_attrs["ring_id"] = 0; + comm_init_attrs["rank_ids"] = 2; + comm_init_attrs["rank"] = rank_id; + comm_init_attrs["device_id"] = device_id; + // comm_init_attrs["rank_ids"] = rank_ids; + auto comm_init_op = f::OpRegistry::CreateOp( + "c_comm_init_hccl", {{"X", {"X"}}}, {}, comm_init_attrs); + auto place = ctx.GetPlace(); + comm_init_op->Run(*scope, place); + ctx.Wait(); +} + +void TestHcomSendOp(f::Scope* scope, const p::DeviceContext& ctx) { + std::cout << "BEGIN TEST:" << __FUNCTION__ << std::endl; + auto x = scope->Var("Data"); + auto tensor_x = x->GetMutable(); + int num = atoi(getenv("DATA_SIZE")); + + EXPECT_GT(num, 0); + EXPECT_LT(num, 1 << 15); + std::vector init(num * num, 1.0 * atoi(getenv("DEST_RANK"))); + int rank_id = atoi(getenv("RANK_ID")); + VLOG(3) << "rank id:" << rank_id; + TensorFromVector(init, ctx, tensor_x); + tensor_x->Resize({num, num}); + ctx.Wait(); + auto place = ctx.GetPlace(); + ctx.Wait(); + + f::AttributeMap attrs; + attrs["tag"] = std::string("srtest"); + attrs["peer"] = atoi(getenv("DEST_RANK")); + attrs["ring_id"] = 0; + attrs["srTag"] = 0; + + auto op = 
f::OpRegistry::CreateOp("send_v2", {{"X", {"Data"}}}, {}, attrs); + + for (int i = 0; i < 10; i++) { + op->Run(*scope, place); + } + VLOG(3) << "send run over"; + ctx.Wait(); +} + +TEST(send_v2, NPU) { + f::Scope scope; + HcclRootInfo hccl_id; + + char* npu_id = getenv("FLAGS_selected_npus"); + VLOG(3) << "Select npu:" << npu_id; + p::NPUDeviceContext ctx(p::NPUPlace(atoi(npu_id))); + + PrepareUniqueId(&scope, ctx, &hccl_id); + Prepare(&scope, ctx, &hccl_id); + TestHcomSendOp(&scope, ctx); +} diff --git a/paddle/fluid/operators/concat_op_npu.cc b/paddle/fluid/operators/concat_op_npu.cc new file mode 100644 index 0000000000000..87bb3397ca267 --- /dev/null +++ b/paddle/fluid/operators/concat_op_npu.cc @@ -0,0 +1,126 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/concat_op.h" +#include "paddle/fluid/operators/npu_op_runner.h" + +namespace paddle { +namespace operators { + +template +class ConcatNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto ins = ctx.MultiInput("X"); + framework::LoDTensor* out = ctx.Output("Out"); + PADDLE_ENFORCE_NOT_NULL(ins[0], + platform::errors::NotFound( + "The first input tensor is not initalized.")); + auto axis = ctx.Attr("axis"); + + if (ctx.HasInput("AxisTensor")) { + PADDLE_THROW(platform::errors::NotFound( + "The AxisTensor is not supported on NPU now.")); + } + axis = ComputeAxis(static_cast(axis), + static_cast(ins[0]->dims().size())); + + auto place = ctx.GetPlace(); + out->mutable_data(place); + + std::vector inputs; + std::vector names; + for (size_t i = 0; i < ins.size(); ++i) { + if (ins[i] && ins[i]->numel() > 0) { + inputs.push_back(*ins[i]); + names.push_back("x" + std::to_string(i)); + } else { + continue; + } + } + auto stream = + ctx.template device_context() + .stream(); + auto runner = NpuOpRunner( + "ConcatD", {inputs}, {*out}, + {{"concat_dim", axis}, {"N", static_cast(inputs.size())}}); + runner.AddInputNames(names); + runner.Run(stream); + } +}; + +template +class ConcatGradNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* out_grad = + ctx.Input(framework::GradVarName("Out")); + auto ins = ctx.MultiInput("X"); + auto out_var_names = ctx.OutputNames(framework::GradVarName("X")); + auto outs = + ctx.MultiOutput(framework::GradVarName("X")); + + PADDLE_ENFORCE_NOT_NULL(ins[0], + platform::errors::NotFound( + "The first input tensor is not initalized.")); + + auto axis = ctx.Attr("axis"); + + axis = ComputeAxis(static_cast(axis), + static_cast(ins[0]->dims().size())); + + int offset = 0; + auto stream = + ctx.template device_context() + .stream(); + for (size_t j = 0; j < outs.size(); ++j) { + // For stop gradient + // get output tensor that the name is not kEmptyVarName + if (out_var_names[j] != framework::kEmptyVarName && + outs[j]->numel() != 0UL) { + outs[j]->mutable_data(ctx.GetPlace()); + 
std::vector offsets; + std::vector sizes; + for (int dim = 0; dim < ins[j]->dims().size(); ++dim) { + if (dim == axis) { + offsets.push_back(offset); + sizes.push_back(ins[j]->dims()[dim]); + } else { + offsets.push_back(0); + sizes.push_back(ins[j]->dims()[dim]); + } + } + auto runner = NpuOpRunner("SliceD", {*out_grad}, {*outs[j]}, + {{"offsets", offsets}, {"size", sizes}}); + runner.Run(stream); + } + if (ins[j]->numel() != 0UL) { + offset += ins[j]->dims()[axis]; + } + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_NPU_KERNEL(concat, ops::ConcatNPUKernel, + ops::ConcatNPUKernel, + ops::ConcatNPUKernel); + +REGISTER_OP_NPU_KERNEL(concat_grad, ops::ConcatGradNPUKernel, + ops::ConcatGradNPUKernel, + ops::ConcatGradNPUKernel); diff --git a/paddle/fluid/operators/controlflow/compare_op.cc b/paddle/fluid/operators/controlflow/compare_op.cc index 3cad86d96c26a..bf047de86fc21 100644 --- a/paddle/fluid/operators/controlflow/compare_op.cc +++ b/paddle/fluid/operators/controlflow/compare_op.cc @@ -23,29 +23,6 @@ limitations under the License. */ namespace paddle { namespace operators { -template -class CompareOpKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - using T = typename Functor::ELEM_TYPE; - using Tensor = framework::Tensor; - - auto* x = context.Input("X"); - auto* y = context.Input("Y"); - auto* z = context.Output("Out"); - int axis = context.Attr("axis"); - - if (x->numel() == 1 && y->numel() == 1) { - bool* z_data = z->mutable_data(context.GetPlace()); - z_data[0] = Functor()(x->data()[0], y->data()[0]); - } else { - ElementwiseComputeEx( - context, x, y, axis, Functor(), z); - } - } -}; - template class CompareOpProtoMaker : public framework::OpProtoAndCheckerMaker { public: @@ -153,16 +130,22 @@ class CompareOp : public framework::OperatorWithKernel { REGISTER_COMPARE_OP_VERSION(op_type); REGISTER_COMPARE_OP(less_than, "Out = X < Y"); -REGISTER_COMPARE_KERNEL(less_than, CPU, paddle::operators::LessThanFunctor); +REGISTER_COMPARE_KERNEL(less_than, CPU, paddle::operators::LessThanFunctor, + paddle::operators::GreaterEqualFunctor); REGISTER_COMPARE_OP(less_equal, "Out = X <= Y"); -REGISTER_COMPARE_KERNEL(less_equal, CPU, paddle::operators::LessEqualFunctor); +REGISTER_COMPARE_KERNEL(less_equal, CPU, paddle::operators::LessEqualFunctor, + paddle::operators::GreaterThanFunctor); REGISTER_COMPARE_OP(greater_than, "Out = X > Y"); REGISTER_COMPARE_KERNEL(greater_than, CPU, - paddle::operators::GreaterThanFunctor); + paddle::operators::GreaterThanFunctor, + paddle::operators::LessEqualFunctor); REGISTER_COMPARE_OP(greater_equal, "Out = X >= Y"); REGISTER_COMPARE_KERNEL(greater_equal, CPU, - paddle::operators::GreaterEqualFunctor); + paddle::operators::GreaterEqualFunctor, + paddle::operators::LessThanFunctor); REGISTER_COMPARE_OP(equal, "Out = X == Y"); -REGISTER_COMPARE_KERNEL(equal, CPU, paddle::operators::EqualFunctor); +REGISTER_COMPARE_KERNEL(equal, CPU, paddle::operators::EqualFunctor, + paddle::operators::EqualFunctor); REGISTER_COMPARE_OP(not_equal, "Out = X != Y"); -REGISTER_COMPARE_KERNEL(not_equal, CPU, paddle::operators::NotEqualFunctor); +REGISTER_COMPARE_KERNEL(not_equal, CPU, paddle::operators::NotEqualFunctor, + paddle::operators::NotEqualFunctor); diff --git a/paddle/fluid/operators/controlflow/compare_op.cu b/paddle/fluid/operators/controlflow/compare_op.cu index b1f3063583597..3ca700e16e6e7 100644 --- 
a/paddle/fluid/operators/controlflow/compare_op.cu +++ b/paddle/fluid/operators/controlflow/compare_op.cu @@ -14,11 +14,17 @@ limitations under the License. */ #include "paddle/fluid/operators/controlflow/compare_op.h" -REGISTER_COMPARE_KERNEL(less_than, CUDA, paddle::operators::LessThanFunctor); -REGISTER_COMPARE_KERNEL(less_equal, CUDA, paddle::operators::LessEqualFunctor); -REGISTER_COMPARE_KERNEL(greater_than, CUDA, +REGISTER_COMPARE_KERNEL(less_than, CUDA, paddle::operators::LessThanFunctor, + paddle::operators::GreaterEqualFunctor); +REGISTER_COMPARE_KERNEL(less_equal, CUDA, paddle::operators::LessEqualFunctor, paddle::operators::GreaterThanFunctor); +REGISTER_COMPARE_KERNEL(greater_than, CUDA, + paddle::operators::GreaterThanFunctor, + paddle::operators::LessEqualFunctor); REGISTER_COMPARE_KERNEL(greater_equal, CUDA, - paddle::operators::GreaterEqualFunctor); -REGISTER_COMPARE_KERNEL(equal, CUDA, paddle::operators::EqualFunctor); -REGISTER_COMPARE_KERNEL(not_equal, CUDA, paddle::operators::NotEqualFunctor); + paddle::operators::GreaterEqualFunctor, + paddle::operators::LessThanFunctor); +REGISTER_COMPARE_KERNEL(equal, CUDA, paddle::operators::EqualFunctor, + paddle::operators::EqualFunctor); +REGISTER_COMPARE_KERNEL(not_equal, CUDA, paddle::operators::NotEqualFunctor, + paddle::operators::NotEqualFunctor); diff --git a/paddle/fluid/operators/controlflow/compare_op.h b/paddle/fluid/operators/controlflow/compare_op.h index b7529e4ae632d..ff929ee7dfce7 100644 --- a/paddle/fluid/operators/controlflow/compare_op.h +++ b/paddle/fluid/operators/controlflow/compare_op.h @@ -68,7 +68,7 @@ struct NotEqualFunctor { } }; -template +template class CompareOpKernel : public framework::OpKernel { public: @@ -80,21 +80,33 @@ class CompareOpKernel auto* y = context.Input("Y"); auto* z = context.Output("Out"); int axis = context.Attr("axis"); - ElementwiseComputeEx(context, x, y, axis, - Functor(), z); + + auto x_dims = x->dims(); + auto y_dims = y->dims(); + if (x_dims.size() >= y_dims.size()) { + ElementwiseComputeEx(context, x, y, axis, + Functor(), z); + } else { + ElementwiseComputeEx( + context, x, y, axis, InverseFunctor(), z); + } } }; } // namespace operators } // namespace paddle -#define REGISTER_COMPARE_KERNEL(op_type, dev, functor) \ - REGISTER_OP_##dev##_KERNEL( \ - op_type, ::paddle::operators::CompareOpKernel< \ - ::paddle::platform::dev##DeviceContext, functor>, \ - ::paddle::operators::CompareOpKernel< \ - ::paddle::platform::dev##DeviceContext, functor>, \ - ::paddle::operators::CompareOpKernel< \ - ::paddle::platform::dev##DeviceContext, functor>, \ - ::paddle::operators::CompareOpKernel< \ - ::paddle::platform::dev##DeviceContext, functor>); +#define REGISTER_COMPARE_KERNEL(op_type, dev, functor, inverse_functor) \ + REGISTER_OP_##dev##_KERNEL(op_type, \ + ::paddle::operators::CompareOpKernel< \ + ::paddle::platform::dev##DeviceContext, \ + functor, inverse_functor>, \ + ::paddle::operators::CompareOpKernel< \ + ::paddle::platform::dev##DeviceContext, \ + functor, inverse_functor>, \ + ::paddle::operators::CompareOpKernel< \ + ::paddle::platform::dev##DeviceContext, \ + functor, inverse_functor>, \ + ::paddle::operators::CompareOpKernel< \ + ::paddle::platform::dev##DeviceContext, \ + functor, inverse_functor>); diff --git a/paddle/fluid/operators/controlflow/compare_op_npu.cc b/paddle/fluid/operators/controlflow/compare_op_npu.cc new file mode 100644 index 0000000000000..591fb55936734 --- /dev/null +++ b/paddle/fluid/operators/controlflow/compare_op_npu.cc @@ -0,0 +1,78 @@ +/* 
Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/op_version_registry.h" +#include "paddle/fluid/operators/controlflow/compare_op.h" +#include "paddle/fluid/operators/elementwise/elementwise_op_function.h" +#include "paddle/fluid/operators/npu_op_runner.h" +#ifdef PADDLE_WITH_ASCEND_CL + +namespace paddle { +namespace operators { + +template +class EqualNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* out = ctx.Output("Out"); + out->mutable_data(ctx.GetPlace()); + + auto runner = NpuOpRunner("Equal", {*x, *y}, {*out}, {}); + auto stream = + ctx.template device_context() + .stream(); + runner.Run(stream); + } +}; + +template +class LessThanNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* z = ctx.Output("Out"); + // int axis = context.Attr("axis"); + z->mutable_data(ctx.GetPlace()); // allocate + auto runner = NpuOpRunner("Less", {*x, *y}, {*z}); + auto stream = + ctx.template device_context() + .stream(); + runner.Run(stream); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_NPU_KERNEL(equal, ops::EqualNPUKernel, + ops::EqualNPUKernel, + ops::EqualNPUKernel); + +REGISTER_OP_NPU_KERNEL( + less_than, + ops::LessThanNPUKernel, + ops::LessThanNPUKernel); + +#endif diff --git a/paddle/fluid/operators/controlflow/conditional_block_op.h b/paddle/fluid/operators/controlflow/conditional_block_op.h index b9ea2ade6cb90..6513bae839e98 100644 --- a/paddle/fluid/operators/controlflow/conditional_block_op.h +++ b/paddle/fluid/operators/controlflow/conditional_block_op.h @@ -78,6 +78,13 @@ class ConditionalOp : public framework::OperatorBase { framework::TensorCopy(*ips[0], platform::CPUPlace(), &cpu_tensor); platform::DeviceContextPool::Instance().Get(ips[0]->place())->Wait(); res = cpu_tensor.data()[0]; +#endif + } else if (platform::is_npu_place(ips[0]->place())) { +#ifdef PADDLE_WITH_ASCEND_CL + framework::LoDTensor cpu_tensor; + framework::TensorCopy(*ips[0], platform::CPUPlace(), &cpu_tensor); + platform::DeviceContextPool::Instance().Get(ips[0]->place())->Wait(); + res = cpu_tensor.data()[0]; #endif } else { res = ips[0]->data()[0]; diff --git a/paddle/fluid/operators/controlflow/fetch_op.cc b/paddle/fluid/operators/controlflow/fetch_op.cc index d86b6b48422d9..fdd1b776bd8fa 100644 --- a/paddle/fluid/operators/controlflow/fetch_op.cc +++ b/paddle/fluid/operators/controlflow/fetch_op.cc @@ -44,6 +44,11 @@ static void DataCopy(const framework::LoDTensor &src_item, TensorCopySync(src_item, platform::CPUPlace(), dst_item); } #else +#ifdef PADDLE_WITH_ASCEND_CL + if 
(platform::is_npu_place(src_item.place())) { + platform::DeviceContextPool::Instance().Get(src_item.place())->Wait(); + } +#endif TensorCopySync(src_item, platform::CPUPlace(), dst_item); #endif } else { diff --git a/paddle/fluid/operators/controlflow/logical_op_npu.cc b/paddle/fluid/operators/controlflow/logical_op_npu.cc new file mode 100644 index 0000000000000..1b0c0e444347a --- /dev/null +++ b/paddle/fluid/operators/controlflow/logical_op_npu.cc @@ -0,0 +1,57 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef PADDLE_WITH_ASCEND_CL +#include +#include + +#include "paddle/fluid/operators/controlflow/logical_op.h" +#include "paddle/fluid/operators/npu_op_runner.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +class LogicalNotNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + + auto* out = ctx.Output("Out"); + + auto place = ctx.GetPlace(); + + out->mutable_data(place); + + auto stream = + ctx.template device_context() + .stream(); + + auto runner = NpuOpRunner("LogicalNot", {*x}, {*out}, {}); + runner.Run(stream); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_NPU_KERNEL( + logical_not, + ops::LogicalNotNPUKernel); + +#endif diff --git a/paddle/fluid/operators/conv_cudnn_op.cu b/paddle/fluid/operators/conv_cudnn_op.cu index 39e9d37ddc6c7..ab535e341f757 100644 --- a/paddle/fluid/operators/conv_cudnn_op.cu +++ b/paddle/fluid/operators/conv_cudnn_op.cu @@ -1363,7 +1363,14 @@ REGISTER_OP_KERNEL( conv2d_grad_grad, CUDNN, plat::CUDAPlace, paddle::operators::CUDNNConvDoubleGradOpKernel, paddle::operators::CUDNNConvDoubleGradOpKernel); - +// ROCM has limit thread in depthwise_conv.cu and willl result in accuracy issue +// Use depthwise_conv2d in MIOPEN to resolve this issue +REGISTER_OP_KERNEL(depthwise_conv2d, CUDNN, plat::CUDAPlace, + paddle::operators::CUDNNConvOpKernel, + paddle::operators::CUDNNConvOpKernel); +REGISTER_OP_KERNEL(depthwise_conv2d_grad, CUDNN, plat::CUDAPlace, + paddle::operators::CUDNNConvGradOpKernel, + paddle::operators::CUDNNConvGradOpKernel); REGISTER_OP_CUDA_KERNEL( depthwise_conv2d_grad_grad, paddle::operators::CUDNNConvDoubleGradOpKernel, diff --git a/paddle/fluid/operators/conv_transpose_cudnn_op.cu b/paddle/fluid/operators/conv_transpose_cudnn_op.cu index a712d31cf7e2c..c4cd5854c0f78 100644 --- a/paddle/fluid/operators/conv_transpose_cudnn_op.cu +++ b/paddle/fluid/operators/conv_transpose_cudnn_op.cu @@ -490,10 +490,6 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel { bool deterministic = FLAGS_cudnn_deterministic; T* input_grad_data = nullptr; T* filter_grad_data = nullptr; - if (input_grad) - input_grad_data = input_grad->mutable_data(ctx.GetPlace()); - if (filter_grad) - filter_grad_data = filter_grad->mutable_data(ctx.GetPlace()); if (input_grad) { input_grad_data = 
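Both NPU-related hunks above address the same pitfall: a tensor that lives on the NPU cannot be dereferenced on the host until the copy back to the CPU has actually finished, so the device context is drained before the host pointer is read. A minimal sketch of the pattern, using the same APIs that appear in the patch (simplified, no error handling, and assuming a bool condition tensor as in conditional_block):

    // Sketch only: stage an NPU-resident flag through a CPU tensor and synchronize
    // the producing device context before touching the host pointer.
    static bool ReadScalarCondition(const framework::LoDTensor& cond) {
      framework::LoDTensor cpu_tensor;
      framework::TensorCopy(cond, platform::CPUPlace(), &cpu_tensor);    // async copy
      platform::DeviceContextPool::Instance().Get(cond.place())->Wait();  // flush stream
      return cpu_tensor.data<bool>()[0];  // now safe to read on the host
    }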
input_grad->mutable_data(ctx.GetPlace()); @@ -884,7 +880,7 @@ class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel { int iwo_group = groups; int c_group = 1; -#if CUDNN_VERSION_MIN(7, 0, 1) +#if defined(PADDLE_WITH_HIP) || CUDNN_VERSION_MIN(7, 0, 1) iwo_group = 1; c_group = groups; groups = 1; @@ -948,7 +944,8 @@ class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel { args1.idesc.set(transformed_ddO_channel, iwo_group); args1.wdesc.set(*W, layout, iwo_group); args1.odesc.set(transformed_ddX, iwo_group); - args1.cdesc.set(dtype, padding_common, strides, dilations, c_group); + args1.cdesc.set(dtype, padding_common, strides, dilations, + platform::AllowTF32Cudnn(), c_group); #ifdef PADDLE_WITH_HIP using search1 = SearchAlgorithm; workspace_size = search1::GetWorkspaceSize(args1); @@ -967,7 +964,8 @@ class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel { args2.idesc.set(transformed_ddO_channel, iwo_group); args2.wdesc.set(*ddW, layout, iwo_group); args2.odesc.set(transformed_X, iwo_group); - args2.cdesc.set(dtype, padding_common, strides, dilations, c_group); + args2.cdesc.set(dtype, padding_common, strides, dilations, + platform::AllowTF32Cudnn(), c_group); #ifdef PADDLE_WITH_HIP using search2 = SearchAlgorithm; workspace_size = @@ -991,7 +989,8 @@ class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel { args3.odesc.set(transformed_ddX_channel, iwo_group); - args3.cdesc.set(dtype, padding_common, strides, dilations, c_group); + args3.cdesc.set(dtype, padding_common, strides, dilations, + platform::AllowTF32Cudnn(), c_group); #ifdef PADDLE_WITH_HIP using search3 = SearchAlgorithm; workspace_size = @@ -1013,7 +1012,8 @@ class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel { args4.idesc.set(transformed_dO, iwo_group); args4.wdesc.set(*ddW, layout, iwo_group); args4.odesc.set(transformed_dX_channel, iwo_group); - args4.cdesc.set(dtype, padding_common, strides, dilations, c_group); + args4.cdesc.set(dtype, padding_common, strides, dilations, + platform::AllowTF32Cudnn(), c_group); #ifdef PADDLE_WITH_HIP using search4 = SearchAlgorithm; workspace_size = @@ -1083,6 +1083,10 @@ class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel { if (ddW) { for (int i = 0; i < groups; i++) { #ifdef PADDLE_WITH_HIP + // MIOPEN ONLY support beta to be 0.0f + Tensor conv_x_ddw(dO->type()); + conv_x_ddw.Resize(transformed_ddO_channel.dims()); + T* conv_x_ddw_data = conv_x_ddw.mutable_data(ctx.GetPlace()); wkspace_handle.RunFunc( [&](void* workspace_ptr) { PADDLE_ENFORCE_CUDA_SUCCESS( @@ -1090,11 +1094,17 @@ class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel { handle, &alpha, args2.odesc.desc(), x + i * group_offset_in, args2.wdesc.desc(), ddw + i * group_offset_filter, args2.cdesc.desc(), - bwd_algo2, &alpha, args2.idesc.desc(), - transformed_ddy_channel + i * group_offset_out, - workspace_ptr, workspace_size)); + bwd_algo2, &beta, args2.idesc.desc(), + conv_x_ddw_data + i * group_offset_out, workspace_ptr, + workspace_size)); }, workspace_size); + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenOpTensor( + handle, miopenTensorOpAdd, &alpha, args2.idesc.desc(), + transformed_ddy_channel + i * group_offset_out, &alpha, + args2.idesc.desc(), conv_x_ddw_data + i * group_offset_out, &beta, + args2.idesc.desc(), + transformed_ddy_channel + i * group_offset_out)); #else // PADDLE_WITH_HIP wkspace_handle.RunFunc( [&](void* workspace_ptr) { diff --git 
a/paddle/fluid/operators/copy_cross_scope_op.cc b/paddle/fluid/operators/copy_cross_scope_op.cc new file mode 100644 index 0000000000000..721354954c703 --- /dev/null +++ b/paddle/fluid/operators/copy_cross_scope_op.cc @@ -0,0 +1,151 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/var_type_traits.h" + +namespace paddle { +namespace framework { +class OpDesc; +template +class EmptyGradOpMaker; +} // namespace framework +namespace imperative { +class OpBase; +} // namespace imperative +} // namespace paddle + +using LoDTensor = paddle::framework::LoDTensor; +using Tensor = paddle::framework::Tensor; + +namespace paddle { +namespace operators { + +class CopyCrossScopeOp : public framework::OperatorBase { + public: + CopyCrossScopeOp(const std::string& type, + const framework::VariableNameMap& inputs, + const framework::VariableNameMap& outputs, + const framework::AttributeMap& attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + + void InferShape(framework::InferShapeContext* ctx) const {} + + private: + void RunImpl(const framework::Scope& scope, + const platform::Place& dev_place) const override { + int num_micro_scopes = scope.kids().size(); + int num_micro_batches = Attr("num_micro_batches"); + bool ToM = Attr("to_main_scope"); + PADDLE_ENFORCE_EQ(num_micro_scopes, num_micro_batches, + platform::errors::InvalidArgument( + "For pipeline, number of micro scopes (%d) should " + "be equal to number of micro batches (%d).", + num_micro_scopes, num_micro_batches)); + const std::string& id_name = Input("Id"); + auto* id_var = scope.FindVar(id_name); + PADDLE_ENFORCE_NOT_NULL( + id_var, + platform::errors::NotFound("No variable with name %s found.", id_name)); + auto id_tensor = id_var->GetMutable(); + auto it = scope.kids().begin(); + framework::Tensor cpu_id_tensor; + TensorCopySync(*id_tensor, platform::CPUPlace(), &cpu_id_tensor); + auto id_value = cpu_id_tensor.data(); + for (auto i = 0; i < *id_value; i++) { + it++; + } + if (it == scope.kids().end()) { + if (ToM) { + auto dst_scope = *it; + const std::string& x_name = Input("X"); + auto* dst_var = dst_scope->FindVar(x_name); + PADDLE_ENFORCE_NOT_NULL( + dst_var, + platform::errors::NotFound( + "No variable with name %s found in source scope.", x_name)); + auto* main_var = scope.FindVar(x_name); + PADDLE_ENFORCE_NOT_NULL( + main_var, + platform::errors::NotFound( + "No variable with name %s found in destination scope.", + x_name)); + auto dst_tensor = dst_var->GetMutable(); + auto main_tensor = main_var->GetMutable(); + TensorCopySync(*dst_tensor, main_tensor->place(), main_tensor); + } + return; + } + auto source_scope = *it; + it++; + auto dst_scope = *it; + const std::string& x_name = Input("X"); + auto* source_var = source_scope->FindVar(x_name); + 
PADDLE_ENFORCE_NOT_NULL( + source_var, + platform::errors::NotFound( + "No variable with name %s found in source scope.", x_name)); + auto* dst_var = dst_scope->FindVar(x_name); + PADDLE_ENFORCE_NOT_NULL( + dst_var, + platform::errors::NotFound( + "No variable with name %s found in destination scope.", x_name)); + auto src_tensor = source_var->GetMutable(); + auto dst_tensor = dst_var->GetMutable(); + TensorCopySync(*src_tensor, dst_tensor->place(), dst_tensor); + + if (ToM) { + auto* main_var = scope.FindVar(x_name); + PADDLE_ENFORCE_NOT_NULL( + main_var, + platform::errors::NotFound( + "No variable with name %s found in destination scope.", x_name)); + auto main_tensor = main_var->GetMutable(); + TensorCopySync(*dst_tensor, main_tensor->place(), main_tensor); + } + } +}; + +class CopyCrossScopeOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", + "(Tensor), The first input tensor of copy_cross_scope op, which " + "is copying micro scope."); + AddInput("Id", + "(Tensor), The second input tensor of copy_cross_scope op, which " + "is a id of the current micro scope."); + AddAttr("to_main_scope", "Return current scope to main scope.") + .SetDefault(false); + AddAttr("num_micro_batches", "Number of micro batches for pipeline."); + AddComment(R"DOC( + This op is used by pipeline to copy tensors across micro batch scopes. + Copy the variable value of the giving Id's micro scope to the micro scope of Id + 1 position. + If need to copy back to the main scope, using to_main_scope option to copy the variable value of + the current micro scope to the main scope. + )DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_WITHOUT_GRADIENT(copy_cross_scope, ops::CopyCrossScopeOp, + ops::CopyCrossScopeOpMaker); diff --git a/paddle/fluid/operators/copy_cross_scope_test.cc b/paddle/fluid/operators/copy_cross_scope_test.cc new file mode 100644 index 0000000000000..e175b235f9c18 --- /dev/null +++ b/paddle/fluid/operators/copy_cross_scope_test.cc @@ -0,0 +1,154 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
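copy_cross_scope walks the list of micro-batch scopes held by the main scope: the Id-th child scope is the source, the (Id+1)-th is the destination, and with to_main_scope the value is propagated back to the main scope. A standalone illustration of that hop with plain containers (not Paddle code; maps stand in for scopes and a float for the tensor):

    // Illustration only: pick the Id-th "micro scope" and copy its value into the
    // next one, mirroring what the operator above does with LoDTensors.
    #include <cassert>
    #include <iterator>
    #include <list>
    #include <map>
    #include <string>

    int main() {
      std::list<std::map<std::string, float>> micro_scopes(3);  // num_micro_batches = 3
      const int id = 1;

      auto src = micro_scopes.begin();
      std::advance(src, id);            // scope of micro batch `id`
      (*src)["tmp"] = 42.0f;

      auto dst = std::next(src);        // scope of micro batch `id + 1`
      (*dst)["tmp"] = (*src)["tmp"];    // the op does this with TensorCopySync

      assert(std::next(micro_scopes.begin(), 2)->at("tmp") == 42.0f);
      return 0;
    }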
*/ + +#ifndef _WIN32 +#include +#endif + +#include +#include +#include +#include // NOLINT +#include + +#include "gtest/gtest.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/operators/copy_cross_scope_op.cc" +#include "paddle/fluid/string/printf.h" + +#define Conn(x, y) x##y + +namespace f = paddle::framework; +namespace p = paddle::platform; + +USE_NO_KERNEL_OP(copy_cross_scope); + +template +void Compare1(f::Scope* scope, const p::DeviceContext& ctx, + std::string op_type) { + // init + auto var_x = scope->Var("tmp"); + auto x = var_x->GetMutable(); + std::vector main_x = {1.0}; + TensorFromVector(main_x, ctx, x); + + auto var_id = scope->Var("Id"); + auto id = var_id->GetMutable(); + std::vector main_id = {1}; + TensorFromVector(main_id, ctx, id); + for (int i = 0; i < 3; i++) { + auto& child_scope = scope->NewScope(); + auto child_var = child_scope.Var("tmp"); + auto tensor_x = child_var->GetMutable(); + std::vector init_x = {static_cast(i)}; + TensorFromVector(init_x, ctx, tensor_x); + } + + ctx.Wait(); + + // run + f::AttributeMap attrs = {{"to_main_scope", false}, {"num_micro_batches", 3}}; + std::map> output; + auto op = f::OpRegistry::CreateOp(op_type, {{"X", {"tmp"}}, {"Id", {"Id"}}}, + output, attrs); + + auto place = ctx.GetPlace(); + op->Run(*scope, place); + ctx.Wait(); + + std::list::const_iterator iter = scope->kids().begin(); + iter++; + iter++; + + auto* kid_scope = *iter; + auto* dst_var = kid_scope->FindVar("tmp"); + auto* tensor_out = dst_var->GetMutable(); + + std::vector out_vec; + TensorToVector(*tensor_out, ctx, &out_vec); + + int expected = 1; + EXPECT_EQ(static_cast(out_vec[0]), expected); +} + +template +void Compare2(f::Scope* scope, const p::DeviceContext& ctx, + std::string op_type) { + // init + auto var_x = scope->Var("tmp"); + auto x = var_x->GetMutable(); + std::vector main_x = {1.0}; + TensorFromVector(main_x, ctx, x); + + auto var_id = scope->Var("Id"); + auto id = var_id->GetMutable(); + std::vector main_id = {0}; + TensorFromVector(main_id, ctx, id); + for (int i = 0; i < 3; i++) { + auto& child_scope = scope->NewScope(); + auto child_var = child_scope.Var("tmp"); + auto tensor_x = child_var->GetMutable(); + std::vector init_x = {static_cast(i)}; + TensorFromVector(init_x, ctx, tensor_x); + } + + ctx.Wait(); + + // run + f::AttributeMap attrs = {{"to_main_scope", true}, {"num_micro_batches", 3}}; + std::map> output; + auto op = f::OpRegistry::CreateOp(op_type, {{"X", {"tmp"}}, {"Id", {"Id"}}}, + output, attrs); + + auto place = ctx.GetPlace(); + op->Run(*scope, place); + ctx.Wait(); + + auto* dst_var = scope->FindVar("tmp"); + auto* tensor_out = dst_var->GetMutable(); + + std::vector out_vec; + TensorToVector(*tensor_out, ctx, &out_vec); + + int expected = 0; + EXPECT_EQ(static_cast(out_vec[0]), expected); +} + +#ifdef PADDLE_WITH_CUDA +TEST(copy_cross_scope, CUDA_fp32) { + f::Scope scope; + p::CUDADeviceContext ctx(p::CUDAPlace(0)); + Compare1(&scope, ctx, "copy_cross_scope"); +} + +TEST(copy_cross_scope_to_main_scope, CUDA_fp32) { + f::Scope scope; + p::CUDADeviceContext ctx(p::CUDAPlace(0)); + Compare2(&scope, ctx, "copy_cross_scope"); +} +#elif PADDLE_WITH_ASCEND_CL +TEST(copy_cross_scope, NPU_fp32) { + f::Scope scope; + p::NPUDeviceContext ctx(p::NPUPlace(0)); + Compare1(&scope, ctx, "copy_cross_scope"); +} + +TEST(copy_cross_scope_to_main_scope, NPU_fp32) { + f::Scope scope; + p::NPUDeviceContext ctx(p::NPUPlace(0)); + 
Compare2(&scope, ctx, "copy_cross_scope"); +} +#endif diff --git a/paddle/fluid/operators/diag_embed_op.cu b/paddle/fluid/operators/diag_embed_op.cu index 2e03622e10f0f..7e3ab6be664cb 100644 --- a/paddle/fluid/operators/diag_embed_op.cu +++ b/paddle/fluid/operators/diag_embed_op.cu @@ -12,6 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include +#include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/diag_embed_op.h" diff --git a/paddle/fluid/operators/dist_op.h b/paddle/fluid/operators/dist_op.h index a2279e40623b4..6a34ef48a169d 100644 --- a/paddle/fluid/operators/dist_op.h +++ b/paddle/fluid/operators/dist_op.h @@ -167,6 +167,7 @@ static void DistGradFunction(const framework::ExecutionContext& context) { auto sign = (x_minux_y > static_cast(0)).template cast() * static_cast(1.0) + (x_minux_y < static_cast(0)).template cast() * static_cast(-1.0); + T epsilon = static_cast(1.0e-10f); // 1: Lp-norm(z), z = x-y, compute dz if (p == 0) { @@ -189,12 +190,14 @@ static void DistGradFunction(const framework::ExecutionContext& context) { // dz = pow(abs(x-y)/out, p-1) * sign(x-y) * dout if (platform::is_cpu_place(context.GetPlace())) { grad_t.device(place) = - (x_minux_y_abs / out_t.broadcast(out_bcast_dims)).pow(p - 1) * + (x_minux_y_abs / (out_t + epsilon).broadcast(out_bcast_dims)) + .pow(p - 1) * sign.eval() * out_grad_t.broadcast(out_bcast_dims); } else { grad_t.device(place) = - (x_minux_y_abs / out_t.broadcast(out_bcast_dims)).pow(p - 1) * sign * - out_grad_t.broadcast(out_bcast_dims); + (x_minux_y_abs / (out_t + epsilon).broadcast(out_bcast_dims)) + .pow(p - 1) * + sign * out_grad_t.broadcast(out_bcast_dims); } } diff --git a/paddle/fluid/operators/distributed/CMakeLists.txt b/paddle/fluid/operators/distributed/CMakeLists.txt deleted file mode 100644 index c9db6148bc45d..0000000000000 --- a/paddle/fluid/operators/distributed/CMakeLists.txt +++ /dev/null @@ -1,76 +0,0 @@ -return() - -if(WITH_GRPC) - set(cc_generic_services "false") -else() - set(cc_generic_services "true") -endif() -configure_file(send_recv.proto.in ${CMAKE_CURRENT_SOURCE_DIR}/send_recv.proto @ONLY) - -cc_library(async_sparse_param_update_recorder SRCS async_sparse_param_update_recorder.cc DEPS enforce simple_threadpool) -cc_test(async_sparse_param_update_recorder_test SRCS async_sparse_param_update_recorder_test.cc DEPS async_sparse_param_update_recorder) - -cc_library(heart_beat_monitor SRCS heart_beat_monitor.cc DEPS enforce simple_threadpool) -cc_library(large_scale_kv SRCS large_scale_kv.cc DEPS enforce simple_threadpool device_context) -cc_test(heart_beat_monitor_test SRCS heart_beat_monitor_test.cc DEPS heart_beat_monitor) - -# FIXME(typhoonzero): use add_subdirectory once we clean the dependency of these files -set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") -if(WITH_GRPC) - set(GRPC_DEPS grpc++_unsecure grpc_unsecure gpr zlib protobuf) - set(GRPC_SRCS grpc/grpc_client.cc grpc/grpc_server.cc grpc/grpc_serde.cc grpc/grpc_bytebuffer_stream.cc grpc/grpc_variable_response.cc) - grpc_library(sendrecvop_rpc SRCS sendrecvop_utils.cc - request_handler_impl.cc rpc_client.cc rpc_server.cc - variable_response.cc - collective_client.cc collective_server.cc - ${GRPC_SRCS} - PROTO send_recv.proto - DEPS lod_tensor selected_rows_functor memory scope ${GRPC_DEPS} async_sparse_param_update_recorder heart_beat_monitor large_scale_kv) - - 
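The epsilon introduced in dist_op.h above protects the Lp-distance gradient, dz = (|x - y| / ||x - y||_p)^(p-1) * sign(x - y) * dout, in the degenerate case x == y: the norm is then zero and the quotient is 0/0, which turns the whole gradient into NaN. A one-element check of the effect (plain C++ rather than the Eigen expression used in the kernel):

    // Single-element dist gradient with and without the epsilon guard.
    #include <cmath>
    #include <cstdio>

    int main() {
      const float p = 2.0f, dout = 1.0f, epsilon = 1.0e-10f;
      const float x = 3.0f, y = 3.0f;            // identical inputs -> distance 0
      const float diff = std::fabs(x - y);
      const float out = diff;                    // Lp norm of a single element
      const float sign = (x > y) - (x < y);      // sign(x - y), 0 here
      const float grad_naive =
          std::pow(diff / out, p - 1.0f) * sign * dout;              // 0/0 -> NaN
      const float grad_safe =
          std::pow(diff / (out + epsilon), p - 1.0f) * sign * dout;  // 0
      std::printf("naive: %f  guarded: %f\n", grad_naive, grad_safe);
      return 0;
    }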
set_source_files_properties(grpc_serde_test.cc rpc_server_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) - set(RPC_DEPS sendrecvop_rpc ${GRPC_DEPS}) - - cc_test(grpc_serde_test SRCS grpc/grpc_serde_test.cc - DEPS ${RPC_DEPS} scope profiler math_function) - -else() - set(BRPC_SRCS brpc/brpc_client.cc brpc/brpc_server.cc brpc/brpc_sendrecvop_utils.cc brpc/brpc_variable_response.cc brpc/brpc_rdma_pool.cc) - set_source_files_properties(${BRPC_SRCS} parameter_prefetch.cc parameter_send.cc parameter_recv.cc communicator.cc rpc_server_test.cc brpc/brpc_serde_test.cc collective_server.cc collective_server_test.cc collective_client.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) - - set(BRPC_DEPS brpc ssl crypto protobuf leveldb zlib) - - brpc_library(sendrecvop_rpc SRCS sendrecvop_utils.cc - request_handler_impl.cc rpc_client.cc rpc_server.cc - variable_response.cc - collective_client.cc collective_server.cc - ${BRPC_SRCS} - PROTO send_recv.proto - DEPS lod_tensor selected_rows memory scope ${BRPC_DEPS}) - - set(RPC_DEPS sendrecvop_rpc ${BRPC_DEPS}) - cc_test(brpc_serde_test SRCS brpc/brpc_serde_test.cc - DEPS ${RPC_DEPS} gflags glog executor proto_desc lookup_sparse_table_read_op) -endif() - - -cc_test(rpc_server_test SRCS rpc_server_test.cc - DEPS ${RPC_DEPS} executor scope proto_desc lookup_sparse_table_read_op checkpoint_notify_op scale_op ) -cc_test(varhandle_test SRCS varhandle_test.cc DEPS profiler scope) -cc_library(parameter_prefetch SRCS parameter_prefetch.cc DEPS sendrecvop_rpc memory node) -cc_library(parameter_send SRCS parameter_send.cc DEPS sendrecvop_rpc memory) -cc_library(parameter_recv SRCS parameter_recv.cc DEPS sendrecvop_rpc memory) -cc_library(communicator SRCS communicator.cc DEPS scope selected_rows tensor variable_helper selected_rows_functor simple_threadpool parameter_send parameter_recv generator) -cc_test(communicator_test SRCS communicator_test.cc DEPS communicator) -if(WITH_GPU OR WITH_ROCM) - cc_test(collective_server_test SRCS collective_server_test.cc - DEPS sendrecvop_rpc executor ${RPC_DEPS} - selected_rows_functor scope math_function) -endif() -if(WITH_TESTING) - if(TEST rpc_server_test) - set_tests_properties(rpc_server_test PROPERTIES TIMEOUT 120) - endif() - if(TEST heart_beat_monitor_test) - set_tests_properties(heart_beat_monitor_test PROPERTIES TIMEOUT 120) - endif() -endif() diff --git a/paddle/fluid/operators/distributed/async_sparse_param_update_recorder.h b/paddle/fluid/operators/distributed/async_sparse_param_update_recorder.h deleted file mode 100644 index 28a5f2ad6c764..0000000000000 --- a/paddle/fluid/operators/distributed/async_sparse_param_update_recorder.h +++ /dev/null @@ -1,186 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#pragma once - -#include -#include -#include // NOLINT -#include -#include -#include -#include -#include -#include - -#include "paddle/fluid/platform/enforce.h" - -namespace paddle { -namespace operators { -namespace distributed { - -class ConcurrentSet { - public: - ConcurrentSet() : pool_(new ::ThreadPool(1)) {} - ~ConcurrentSet() {} - - std::future Update(const std::vector& rows) { - auto task = [this, rows] { - if (VLOG_IS_ON(3)) { - std::ostringstream sstream; - sstream << "["; - for (auto& id : rows) { - sstream << id << ", "; - } - sstream << "]"; - VLOG(3) << "update ids -> " << sstream.str(); - } - for (auto row : rows) { - set_.insert(row); - } - }; - return pool_->enqueue(std::move(task)); - } - - std::future GetAndClear(std::vector* result) { - auto task = [this, &result] { - result->clear(); - for (auto& id : set_) { - result->push_back(id); - } - if (VLOG_IS_ON(3)) { - std::ostringstream sstream; - sstream << "["; - for (auto& id : *result) { - sstream << id << ", "; - } - sstream << "]"; - VLOG(3) << "result ids size: " << result->size() << " " - << sstream.str(); - } - set_.clear(); - }; - return pool_->enqueue(std::move(task)); - } - - private: - std::unordered_set set_; - std::unique_ptr<::ThreadPool> pool_{nullptr}; -}; - -class AsyncSparseParamUpdateRecorder { - using TrainerToRows = std::vector>; - - public: - AsyncSparseParamUpdateRecorder( - int trainer_num, - const std::unordered_map& grad_to_param) - : trainer_num_(trainer_num), grad_to_param_(grad_to_param) { - if (VLOG_IS_ON(3)) { - std::ostringstream sstream; - sstream << "["; - for (auto& item : grad_to_param) { - sstream << item.first << ":" << item.second << ", "; - } - sstream << "]"; - VLOG(3) << "trainer_num: " << trainer_num - << " grad_to_param_: " << sstream.str(); - } - for (auto& iter : grad_to_param) { - param_to_grad_[iter.second] = iter.first; - auto& param_name = iter.second; - param_to_updated_rows_[param_name] = TrainerToRows(); - auto& trainer_to_rows = param_to_updated_rows_[param_name]; - for (auto i = 0; i < trainer_num; ++i) { - trainer_to_rows.emplace_back(new ConcurrentSet()); - } - } - } - - ~AsyncSparseParamUpdateRecorder() = default; - - void Update(const std::string& grad_name, - const std::vector& update_rows) { - VLOG(3) << "update grad: " << grad_name - << " row size: " << update_rows.size(); - auto& param_name = grad_to_param_.at(grad_name); - auto& trainer_to_rows = param_to_updated_rows_.at(param_name); - - std::vector> fs; - for (auto& set : trainer_to_rows) { - fs.push_back(set->Update(update_rows)); - } - for (auto& f : fs) { - f.wait(); - } - } - - void GetAndClear(const std::string& param_name, int trainer_id, - std::vector* result) { - VLOG(3) << "GetAndClear param: " << param_name - << " for trainer: " << trainer_id; - PADDLE_ENFORCE_LT( - trainer_id, trainer_num_, - platform::errors::InvalidArgument( - "The value of trainer_id: %s should less than trainer_num: %s.", - trainer_id, trainer_num_)); - param_to_updated_rows_.at(param_name)[trainer_id] - ->GetAndClear(result) - .wait(); - } - - bool HasParam(const std::string& param_name) { - return param_to_grad_.find(param_name) != param_to_grad_.end(); - } - - bool HasGrad(const std::string& grad_name) { - return grad_to_param_.find(grad_name) != grad_to_param_.end(); - } - - private: - const int trainer_num_; - std::unordered_map grad_to_param_; - std::unordered_map param_to_grad_; - std::unordered_map param_to_updated_rows_; - - // init recorder - public: - static void Init( - int trainer_num, - const 
std::unordered_map& grad_to_param) { - InitImpl(trainer_num, grad_to_param); - } - - static AsyncSparseParamUpdateRecorder* GetInstance() { - return recorder_.get(); - } - - private: - // Init is called by GetInstance. - static void InitImpl( - int trainer_num, - const std::unordered_map& grad_to_param) { - if (recorder_ == nullptr) { - recorder_.reset( - new AsyncSparseParamUpdateRecorder(trainer_num, grad_to_param)); - } - } - - static std::once_flag init_flag_; - static std::unique_ptr recorder_; -}; - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/async_sparse_param_update_recorder_test.cc b/paddle/fluid/operators/distributed/async_sparse_param_update_recorder_test.cc deleted file mode 100644 index 2d78559625c91..0000000000000 --- a/paddle/fluid/operators/distributed/async_sparse_param_update_recorder_test.cc +++ /dev/null @@ -1,97 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/operators/distributed/async_sparse_param_update_recorder.h" -#include -#include "gtest/gtest.h" - -namespace paddle { -namespace operators { -namespace distributed { - -TEST(ConcurrentSet, All) { - ConcurrentSet concurrent_set; - std::vector in1 = {1, 2, 3, 4}; - std::vector in2 = {2, 3, 5, 6}; - - std::vector> futures; - futures.push_back(concurrent_set.Update(in1)); - futures.push_back(concurrent_set.Update(in2)); - - for (auto &f : futures) { - f.wait(); - } - - std::unordered_set in; - std::copy(in1.begin(), in1.end(), std::inserter(in, in.begin())); - std::copy(in2.begin(), in2.end(), std::inserter(in, in.begin())); - - std::vector ret; - concurrent_set.GetAndClear(&ret).wait(); - - std::unordered_set out; - std::copy(ret.begin(), ret.end(), std::inserter(out, out.begin())); - - EXPECT_EQ(in, out); - - concurrent_set.GetAndClear(&ret).wait(); - EXPECT_EQ(ret.size(), 0UL); -} - -TEST(AsyncSparseParamUpdateRecorder, All) { - std::unordered_map grad_to_param; - grad_to_param["grad1"] = "param1"; - grad_to_param["grad2"] = "param2"; - - int trainer_num = 10; - - AsyncSparseParamUpdateRecorder recorder(trainer_num, grad_to_param); - std::vector in1 = {1, 2, 3, 4}; - std::vector in2 = {2, 3, 5, 6}; - - std::unordered_set in; - std::copy(in1.begin(), in1.end(), std::inserter(in, in.begin())); - std::copy(in2.begin(), in2.end(), std::inserter(in, in.begin())); - - recorder.Update("grad1", in1); - recorder.Update("grad1", in2); - - EXPECT_TRUE(recorder.HasParam("param1")); - EXPECT_TRUE(recorder.HasParam("param2")); - EXPECT_FALSE(recorder.HasParam("param3")); - - EXPECT_TRUE(recorder.HasGrad("grad1")); - EXPECT_TRUE(recorder.HasGrad("grad2")); - EXPECT_FALSE(recorder.HasGrad("grad3")); - - std::vector ret; - EXPECT_ANY_THROW(recorder.GetAndClear("param1", trainer_num, &ret)); - - for (int i = 0; i < trainer_num; ++i) { - std::vector ret; - std::unordered_set out; - - recorder.GetAndClear("param1", i, &ret); - std::copy(ret.begin(), 
ret.end(), std::inserter(out, out.begin())); - - EXPECT_EQ(in, out); - - recorder.GetAndClear("param1", i, &ret); - EXPECT_EQ(ret.size(), 0UL); - } -} - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/brpc/brpc_client.cc b/paddle/fluid/operators/distributed/brpc/brpc_client.cc deleted file mode 100644 index b2a26089c8689..0000000000000 --- a/paddle/fluid/operators/distributed/brpc/brpc_client.cc +++ /dev/null @@ -1,462 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/operators/distributed/brpc/brpc_client.h" -#include "paddle/fluid/framework/threadpool.h" -#include "paddle/fluid/operators/distributed/brpc/brpc_sendrecvop_utils.h" -#include "paddle/fluid/platform/profiler.h" - -namespace paddle { -namespace operators { -namespace distributed { - -DEFINE_int32(timeout_ms, 30000, "RPC timeout in milliseconds"); -DEFINE_int32(max_retry, 3, "Max retries(not including the first RPC)"); - -BRPCClient::~BRPCClient() { Wait(); } - -void HandleSendResponse(brpc::Controller* cntl, sendrecv::VoidMessage* response, - VarHandlePtr var_h, ChannelQueuePtr ch_ptr, - ChannelContextPtr ch_ctx, BRPCClient* cls) { - // std::unique_ptr makes sure cntl/response will be deleted before returning. - std::unique_ptr cntl_guard(cntl); - std::unique_ptr response_guard(response); - - // this channel can be used by other now. 
- ch_ptr->Push(ch_ctx); - - if (cntl->Failed()) { - PADDLE_THROW(platform::errors::Unavailable( - "Failed to send variable %s, error text is %s.", var_h->name(), - cntl->ErrorText())); - var_h->Finish(false); - cls->DecreaseReqCount(); - return; - } - var_h->Finish(true); - cls->DecreaseReqCount(); - - VLOG(4) << "HandleSendResponse from: " << cntl->remote_side() - << ", varname: " << var_h->name() - << ", latency: " << cntl->latency_us() << "us"; - VLOG(4) << "Finish HandleSendResponse"; -} - -VarHandlePtr BRPCClient::AsyncSendVar(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& var_name, - int64_t time_out) { - const platform::DeviceContext* p_ctx = &ctx; - const std::string ep_val = ep; - const std::string var_name_val = var_name; - const framework::Scope* p_scope = &scope; - const auto ch_ptr = GetChannel(ep_val); - const std::string method = kSendRPC; - VarHandlePtr var_h(new VarHandle(ep, method, var_name_val, p_ctx, p_scope)); - - framework::AsyncIO([=] { - auto ch_ctx = ch_ptr->Pop(); - brpc::Controller* cntl = new brpc::Controller(); - sendrecv::VoidMessage* response = new sendrecv::VoidMessage(); - cntl->set_timeout_ms(time_out); - - auto* var = p_scope->FindVar(var_name_val); - sendrecv::VariableMessage request; - distributed::SerializeToIOBuf(var_name_val, var, *p_ctx, &request, - &cntl->request_attachment(), "", false, - trainer_id_); - - google::protobuf::Closure* done = brpc::NewCallback( - &HandleSendResponse, cntl, response, var_h, ch_ptr, ch_ctx, this); - - platform::RecordRPCEvent record_event(method); - - ch_ctx->stub->SendVariable(cntl, &request, response, done); - - if (UNLIKELY(platform::IsProfileEnabled())) { - var_h->Wait(); - } - }); - req_count_++; - - return var_h; -} -void HandleFetchBarrierResponse(brpc::Controller* cntl, - sendrecv::VariableMessage* response, - VarHandlePtr var_h, ChannelQueuePtr ch_ptr, - ChannelContextPtr ch_ctx, BRPCClient* cls) { - // std::unique_ptr makes sure cntl/response will be deleted before returning. - std::unique_ptr cntl_guard(cntl); - std::unique_ptr response_guard(response); - - // this channel can be used other now. - ch_ptr->Push(ch_ctx); - - if (cntl->Failed()) { - PADDLE_THROW(platform::errors::Unavailable( - "Failed to get HandleFetchBarrierResponse %s, error text is %s.", - var_h->name(), cntl->ErrorText())); - var_h->Finish(false); - cls->DecreaseReqCount(); - return; - } - - var_h->Finish(true); - cls->DecreaseReqCount(); - - VLOG(4) << "HandleFetchBarrierResponse from: " << cntl->remote_side() - << ", varname: " << var_h->name() - << ", latency: " << cntl->latency_us() << "us"; - VLOG(4) << "Finish HandleFetchBarrierResponse"; -} -void HandleGetResponse(brpc::Controller* cntl, - sendrecv::VariableMessage* response, VarHandlePtr var_h, - ChannelQueuePtr ch_ptr, ChannelContextPtr ch_ctx, - BRPCClient* cls) { - // std::unique_ptr makes sure cntl/response will be deleted before returning. - std::unique_ptr cntl_guard(cntl); - std::unique_ptr response_guard(response); - - // this channel can be used other now. 
- ch_ptr->Push(ch_ctx); - - if (cntl->Failed()) { - PADDLE_THROW(platform::errors::Unavailable( - "Failed to get variable %s, error text is %s.", var_h->name(), - cntl->ErrorText())); - cls->DecreaseReqCount(); - var_h->Finish(false); - return; - } - - VLOG(4) << "HandleGetResponse from: " << cntl->remote_side() - << ", varname: " << var_h->name() - << ", latency: " << cntl->latency_us() << "us"; - - framework::Variable* outvar = nullptr; - int trainer_id; - distributed::DeserializeFromIOBuf(*response, cntl->response_attachment(), - *var_h->ctx(), var_h->scope(), &outvar, - &trainer_id); - VLOG(4) << "Finish HandleGetResponse"; - cls->DecreaseReqCount(); - var_h->Finish(true); -} - -VarHandlePtr BRPCClient::_AsyncGetVar(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& var_name, - const std::string& out_var_name, - const std::string& method_name, - int64_t time_out) { - const platform::DeviceContext* p_ctx = &ctx; - const std::string ep_val = ep; - const std::string var_name_val = var_name; - const std::string out_varname_val = out_var_name; - const framework::Scope* p_scope = &scope; - const auto ch_ptr = GetChannel(ep_val); - const std::string method = kGetRPC; - VarHandlePtr var_h( - new VarHandle(ep, method, out_varname_val, p_ctx, p_scope)); - - framework::AsyncIO([=] { - auto ch_ctx = ch_ptr->Pop(); - - brpc::Controller* cntl = new brpc::Controller(); - sendrecv::VariableMessage* response = new sendrecv::VariableMessage(); - cntl->set_timeout_ms(time_out); - - sendrecv::VariableMessage req; - req.set_varname(var_name_val); - req.set_out_varname(out_varname_val); - req.set_trainer_id(trainer_id_); - - google::protobuf::Closure* done = brpc::NewCallback( - &HandleGetResponse, cntl, response, var_h, ch_ptr, ch_ctx, this); - - platform::RecordRPCEvent record_event(method); - - if (method_name == kGetMonomerRPC) { - ch_ctx->stub->GetMonomerVariable(cntl, &req, response, done); - } else if (method_name == kGetNoBarrierRPC) { - ch_ctx->stub->GetVariableNoBarrier(cntl, &req, response, done); - } else { - ch_ctx->stub->GetVariable(cntl, &req, response, done); - } - - if (UNLIKELY(platform::IsProfileEnabled())) { - var_h->Wait(); - } - }); - - req_count_++; - - return var_h; -} - -VarHandlePtr BRPCClient::AsyncGetVarNoBarrier( - const std::string& ep, const platform::DeviceContext& ctx, - const framework::Scope& scope, const std::string& var_name, - const std::string& out_var_name, int64_t time_out) { - std::string var_name_no_barrier = - string::Sprintf("%s%s", var_name, WITHOUT_BARRIER_MESSAGE); - - return _AsyncGetVar(ep, ctx, scope, var_name_no_barrier, out_var_name, - kGetNoBarrierRPC, time_out); -} - -VarHandlePtr BRPCClient::AsyncGetMonomerVariable( - const std::string& ep, const platform::DeviceContext& ctx, - const framework::Scope& scope, const std::string& var_name, - int64_t time_out) { - return _AsyncGetVar(ep, ctx, scope, var_name, var_name, kGetMonomerRPC, - time_out); -} - -VarHandlePtr BRPCClient::AsyncGetMonomerBarrier(const std::string& ep, - const std::string& var_name, - int64_t time_out) { - return AsyncSendMessage(ep, kSendMonomerFetchBarrierRPC, var_name, time_out); -} - -VarHandlePtr BRPCClient::AsyncGetVar(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& var_name, - const std::string& out_var_name, - const std::string& table_name, - int64_t time_out) { - return _AsyncGetVar(ep, ctx, scope, var_name, out_var_name, kGetRPC, - time_out); -} - 
-VarHandlePtr BRPCClient::AsyncPrefetchVar(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& in_var_name, - const std::string& out_var_name, - const std::string& table_name, - int64_t time_out) { - const platform::DeviceContext* p_ctx = &ctx; - const std::string ep_val = ep; - const std::string in_var_name_val = in_var_name; - const std::string out_var_name_val = out_var_name; - const std::string table_name_val = table_name; - const framework::Scope* p_scope = &scope; - const auto ch_ptr = GetChannel(ep_val); - - const std::string method = kPrefetchRPC; - - VarHandlePtr var_h( - new VarHandle(ep, method, out_var_name_val, p_ctx, p_scope)); - - framework::AsyncIO([=] { - auto ch_ctx = ch_ptr->Pop(); - - brpc::Controller* cntl = new brpc::Controller(); - sendrecv::VariableMessage* response = new sendrecv::VariableMessage(); - cntl->set_timeout_ms(time_out); - - auto* var = p_scope->FindVar(in_var_name_val); - sendrecv::VariableMessage req; - distributed::SerializeToIOBuf(in_var_name_val, var, *p_ctx, &req, - &cntl->request_attachment(), out_var_name_val, - false, 0, table_name_val); - - platform::RecordRPCEvent record_event(method); - - google::protobuf::Closure* done = brpc::NewCallback( - &HandleGetResponse, cntl, response, var_h, ch_ptr, ch_ctx, this); - - ch_ctx->stub->PrefetchVariable(cntl, &req, response, done); - - if (UNLIKELY(platform::IsProfileEnabled())) { - var_h->Wait(); - } - }); - - req_count_++; - return var_h; -} - -VarHandlePtr BRPCClient::AsyncSendBatchBarrier(const std::string& ep, - int64_t time_out) { - return AsyncSendMessage(ep, kBatchBarrierRPC, BATCH_BARRIER_MESSAGE, - time_out); -} - -VarHandlePtr BRPCClient::AsyncSendFetchBarrier(const std::string& ep, - int64_t time_out) { - auto ch_ptr = GetChannel(ep); - auto ch_ctx = ch_ptr->Pop(); - - brpc::Controller* cntl = new brpc::Controller(); - sendrecv::VariableMessage* response = new sendrecv::VariableMessage(); - cntl->set_timeout_ms(time_out); - - sendrecv::VariableMessage req; - req.set_varname(FETCH_BARRIER_MESSAGE); - - const std::string method = kFetchBarrierRPC; - // var handle - VarHandlePtr var_h( - new VarHandle(ep, method, FETCH_BARRIER_MESSAGE, nullptr, nullptr)); - - platform::RecordRPCEvent record_event(method); - - google::protobuf::Closure* done = brpc::NewCallback( - &HandleFetchBarrierResponse, cntl, response, var_h, ch_ptr, ch_ctx, this); - - ch_ctx->stub->GetVariable(cntl, &req, response, done); - - req_count_++; - - if (UNLIKELY(platform::IsProfileEnabled())) { - var_h->Wait(); - } - - return var_h; -} - -bool BRPCClient::Wait() { - VLOG(9) << "begin to brpcclient wait"; - { - std::unique_lock lk(sync_mutex_); - sync_cond_.wait(lk, [this] { return req_count_ == 0; }); - } - VLOG(9) << "end to brpcclient wait"; - return true; -} - -ChannelQueuePtr BRPCClient::GetChannel(const std::string& ep) { - VLOG(4) << "begin to GetChannel:" << ep; - { - std::lock_guard guard(chan_mutex_); - auto it = channels_.find(ep); - if (it != channels_.end()) { - VLOG(4) << "end to GetChannel:" << ep; - return it->second; - } - } - - ChannelQueuePtr q(new framework::BlockingQueue()); - - brpc::ChannelOptions options; -#ifdef PADDLE_WITH_BRPC_RDMA - options.use_rdma = true; -#endif - options.protocol = "baidu_std"; - // don't use pooled type. the server can't afford that. 
- options.connection_type = "single"; - options.connect_timeout_ms = 1000; - options.timeout_ms = FLAGS_timeout_ms /*milliseconds*/; - options.max_retry = FLAGS_max_retry; - - VLOG(1) << "create " << brpc_channel_num_per_server_ - << " brpc channels to pserver:" << ep; - - for (int i = 0; i < brpc_channel_num_per_server_; ++i) { - std::shared_ptr c(new ChannelContext()); - if (c->channel.Init(ep.c_str(), &options) != 0) { - PADDLE_THROW( - platform::errors::Unavailable("Failed to initialize channel.")); - return nullptr; - } - - c->stub.reset(new sendrecv::SendRecvService_Stub( - static_cast(&c->channel))); - q->Push(c); - } - - { - std::lock_guard guard(chan_mutex_); - channels_[ep] = q; - } - - VLOG(4) << "end to GetChannel:" << ep; - return q; -} - -VarHandlePtr BRPCClient::AsyncSendComplete(const std::string& ep, - int64_t time_out) { - return AsyncSendMessage(ep, kSendCompleteRPC, COMPLETE_MESSAGE, time_out); -} - -void BRPCClient::SendComplete() { - for (auto& kv : channels_) { - AsyncSendComplete(kv.first); - } -} - -VarHandlePtr BRPCClient::AsyncSendVarMessage( - const std::string& ep, const std::string& method_name, - const sendrecv::VariableMessage& req, int64_t time_out) { - auto ch_ptr = GetChannel(ep); - auto ch_ctx = ch_ptr->Pop(); - - brpc::Controller* cntl = new brpc::Controller(); - sendrecv::VoidMessage* response = new sendrecv::VoidMessage(); - cntl->set_timeout_ms(time_out); - - platform::RecordRPCEvent record_event(method_name); - - VarHandlePtr var_h( - new VarHandle(ep, method_name, req.varname(), nullptr, nullptr)); - - google::protobuf::Closure* done = brpc::NewCallback( - &HandleSendResponse, cntl, response, var_h, ch_ptr, ch_ctx, this); - - if (method_name == kCheckPointNotifyRPC) { - ch_ctx->stub->CheckpointNotify(cntl, &req, response, done); - } else if (method_name == kSendMonomerFetchBarrierRPC) { - ch_ctx->stub->GetMonomerBarrier(cntl, &req, response, done); - } else { - ch_ctx->stub->SendVariable(cntl, &req, response, done); - } - req_count_++; - - if (UNLIKELY(platform::IsProfileEnabled())) { - var_h->Wait(); - } - - return var_h; -} - -VarHandlePtr BRPCClient::AsyncSendMessage(const std::string& ep, - const std::string& method_name, - const std::string& message, - int64_t time_out) { - sendrecv::VariableMessage req; - req.set_varname(message); - - return AsyncSendVarMessage(ep, method_name, req, time_out); -} - -VarHandlePtr BRPCClient::AsyncCheckpointNotify(const std::string& ep, - const std::string& dirname, - const std::string& varname, - const int mode, - int64_t time_out) { - sendrecv::VariableMessage req; - req.set_varname(varname); - req.set_out_varname(dirname); - - return AsyncSendVarMessage(ep, "CheckPointNotifyRPC", req, time_out); -} - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/brpc/brpc_client.h b/paddle/fluid/operators/distributed/brpc/brpc_client.h deleted file mode 100644 index 91f94b4c9d5a3..0000000000000 --- a/paddle/fluid/operators/distributed/brpc/brpc_client.h +++ /dev/null @@ -1,174 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include - -#include // NOLINT -#include -#include -#include -#include -#include -#include // NOLINT -#include -#include -#include - -#include "brpc/channel.h" -#include "paddle/fluid/framework/blocking_queue.h" -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/framework/selected_rows.h" -#include "paddle/fluid/operators/distributed/brpc/brpc_sendrecvop_utils.h" -#include "paddle/fluid/operators/distributed/distributed_pb.h" -#include "paddle/fluid/operators/distributed/request_handler.h" -#include "paddle/fluid/operators/distributed/rpc_client.h" -#include "paddle/fluid/platform/macros.h" // for DISABLE_COPY_AND_ASSIGN - -namespace paddle { -namespace operators { -namespace distributed { - -struct ChannelContext { - brpc::Channel channel; - std::shared_ptr stub; -}; - -typedef std::shared_ptr ChannelContextPtr; -typedef std::shared_ptr> - ChannelQueuePtr; - -class BRPCClient : public RPCClient { - public: - BRPCClient() {} - virtual ~BRPCClient(); - - VarHandlePtr AsyncSendVar(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& var_name, - int64_t time_out = FLAGS_rpc_deadline) override; - - VarHandlePtr AsyncGetVar(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& var_name, - const std::string& out_var_name, - const std::string& table_name = "", - int64_t time_out = FLAGS_rpc_deadline) override; - - VarHandlePtr AsyncGetMonomerBarrier( - const std::string& ep, const std::string& var_name, - int64_t time_out = FLAGS_rpc_deadline) override; - - VarHandlePtr AsyncGetMonomerVariable( - const std::string& ep, const platform::DeviceContext& ctx, - const framework::Scope& scope, const std::string& var_name, - int64_t time_out = FLAGS_rpc_deadline) override; - - VarHandlePtr AsyncGetVarNoBarrier(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& var_name, - const std::string& out_varname, - int64_t time_out = FLAGS_rpc_deadline); - - VarHandlePtr AsyncPrefetchVar(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& in_var_name, - const std::string& out_var_name, - const std::string& table_name = "", - int64_t time_out = FLAGS_rpc_deadline) override; - - VarHandlePtr AsyncSendBatchBarrier( - const std::string& ep, int64_t time_out = FLAGS_rpc_deadline) override; - - VarHandlePtr AsyncSendFetchBarrier( - const std::string& ep, int64_t time_out = FLAGS_rpc_deadline) override; - - VarHandlePtr AsyncCheckpointNotify( - const std::string& ep, const std::string& dirname, - const std::string& varname, const int mode, - int64_t time_out = FLAGS_rpc_deadline) override; - - bool Wait() override; - - void SendComplete() override; - - private: - VarHandlePtr _AsyncGetVar( - const std::string& ep, const platform::DeviceContext& ctx, - const framework::Scope& scope, const std::string& var_name, - const std::string& out_var_name, const 
std::string& method_name, - const std::string& table_name, int64_t time_out = FLAGS_rpc_deadline); - - void Proceed(); - ChannelQueuePtr GetChannel(const std::string& ep); - - VarHandlePtr AsyncSendComplete(const std::string& ep, - int64_t time_out = FLAGS_rpc_deadline); - - VarHandlePtr AsyncSendMessage(const std::string& ep, - const std::string& method_name, - const std::string& message, int64_t time_out); - - VarHandlePtr AsyncSendVarMessage(const std::string& ep, - const std::string& method_name, - const sendrecv::VariableMessage& req, - int64_t time_out); - - friend void HandleSendResponse(brpc::Controller* cntl, - sendrecv::VoidMessage* response, - VarHandlePtr var_h, ChannelQueuePtr ch_ptr, - ChannelContextPtr ch_ctx, BRPCClient* cls); - - friend void HandleGetResponse(brpc::Controller* cntl, - sendrecv::VariableMessage* response, - VarHandlePtr var_h, ChannelQueuePtr ch_ptr, - ChannelContextPtr ch_ctx, BRPCClient* cls); - - friend void HandleFetchBarrierResponse(brpc::Controller* cntl, - sendrecv::VariableMessage* response, - VarHandlePtr var_h, - ChannelQueuePtr ch_ptr, - ChannelContextPtr ch_ctx, - BRPCClient* cls); - void DecreaseReqCount() { - if (--req_count_ <= 0) { - sync_cond_.notify_all(); - } - } - - private: - std::unordered_map channels_; - - // mutex for Wait client sync - std::mutex sync_mutex_; - std::condition_variable sync_cond_; - std::atomic req_count_{0}; - - static constexpr int brpc_channel_num_per_server_ = 4; - - // mutex for GetChannel thread safety - std::mutex chan_mutex_; - DISABLE_COPY_AND_ASSIGN(BRPCClient); -}; - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/brpc/brpc_rdma_pool.cc b/paddle/fluid/operators/distributed/brpc/brpc_rdma_pool.cc deleted file mode 100644 index 94f0b9919ace8..0000000000000 --- a/paddle/fluid/operators/distributed/brpc/brpc_rdma_pool.cc +++ /dev/null @@ -1,86 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#ifdef PADDLE_WITH_BRPC_RDMA - -#include "paddle/fluid/operators/distributed/brpc/brpc_rdma_pool.h" -#include "brpc/channel.h" -#include "brpc/rdma/rdma_helper.h" -#include "paddle/fluid/platform/enforce.h" - -namespace paddle { -namespace operators { -namespace distributed { - -RdmaMemPool& RdmaMemPool::Instance() { - static RdmaMemPool* g_rdma_mem_pool = new RdmaMemPool(); - return *g_rdma_mem_pool; -} - -void* RdmaMemPool::Find(const std::string& varname, int64_t size) { - pthread_rwlock_rdlock(&access_); - auto it = pool_.find(varname); - if (it == pool_.end()) { - pthread_rwlock_unlock(&access_); - return nullptr; - } - - auto info = it->second; - if (info.data_size != size) { - pthread_rwlock_unlock(&access_); - PADDLE_THROW(platform::errors::InvalidArgument( - "var:%s size:%ld != %ld", varname, size, info.data_size)); - return nullptr; - } - - pthread_rwlock_unlock(&access_); - return info.data; -} - -void RdmaMemPool::Register(const std::string& varname, void* data, - int64_t data_size) { - void* old = Find(varname, data_size); - if (old != nullptr) { - PADDLE_ENFORCE_EQ( - data, old, platform::errors::InvalidArgument("var:%s data:%ld != %ld", - varname, data, old)); - VLOG(7) << "Find on rdma:" << varname << " data:" << data - << " data_size:" << data_size; - return; - } - - VarInfo info; - info.data = data; - info.data_size = data_size; - - pthread_rwlock_wrlock(&access_); - pool_[varname] = info; - pthread_rwlock_unlock(&access_); - - if (brpc::rdma::RegisterMemoryForRdma(data, data_size)) { - PADDLE_THROW(platform::errors::Unavailable( - "Register memory for RDMA failed. Register %s data: %s data size %d " - "error.", - varname, data, data_size)); - } - - VLOG(4) << "register on rdma:" << varname << " data:" << data - << " data_size:" << data_size; -} - -} // namespace distributed -} // namespace operators -} // namespace paddle - -#endif diff --git a/paddle/fluid/operators/distributed/brpc/brpc_rdma_pool.h b/paddle/fluid/operators/distributed/brpc/brpc_rdma_pool.h deleted file mode 100644 index 156a93ec57847..0000000000000 --- a/paddle/fluid/operators/distributed/brpc/brpc_rdma_pool.h +++ /dev/null @@ -1,56 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once -#ifdef PADDLE_WITH_BRPC_RDMA - -#include // NOLINT -#include -#include - -namespace paddle { -namespace operators { -namespace distributed { - -/* - * This class is used to avoid duplicated registion of brpc::rdma. 
- */ -class RdmaMemPool { - public: - static RdmaMemPool& Instance(); - RdmaMemPool() : access_(PTHREAD_RWLOCK_INITIALIZER) {} - - virtual ~RdmaMemPool() { pthread_rwlock_destroy(&access_); } - - void Register(const std::string& varname, void* data, int64_t size); - void* Find(const std::string& varname, int64_t size); - - private: - struct VarInfo { - void* data; - int64_t data_size; - - VarInfo() : data(nullptr), data_size(0) {} - }; - - private: - std::unordered_map pool_; - pthread_rwlock_t access_; -}; - -} // namespace distributed -} // namespace operators -} // namespace paddle - -#endif diff --git a/paddle/fluid/operators/distributed/brpc/brpc_sendrecvop_utils.cc b/paddle/fluid/operators/distributed/brpc/brpc_sendrecvop_utils.cc deleted file mode 100644 index 411c0f36debd3..0000000000000 --- a/paddle/fluid/operators/distributed/brpc/brpc_sendrecvop_utils.cc +++ /dev/null @@ -1,224 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef PADDLE_WITH_NCCL -#include -#endif -#ifdef PADDLE_WITH_RCCL -#include -#endif -#include -#include -#include -#include // NOLINT - -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/operators/distributed/brpc/brpc_rdma_pool.h" -#include "paddle/fluid/operators/distributed/brpc/brpc_sendrecvop_utils.h" -#include "paddle/fluid/operators/distributed/brpc/brpc_variable_response.h" -#include "paddle/fluid/operators/distributed/distributed_pb.h" -#include "paddle/fluid/platform/profiler.h" - -namespace paddle { -namespace operators { -namespace distributed { - -class IOBufWriter { - public: - static void Append(const std::string& varname, butil::IOBuf* iobuf, int k, - const char* v, int64_t vlen) { - if (vlen >= std::numeric_limits::max() || vlen < 0) { - PADDDLE_THROW(platform::errors::Unavailable( - "Variable lenght is invalid. 
Variable name is %s, length is %d.", - varname, vlen)); - } - - iobuf->append(reinterpret_cast(&k), 4); - iobuf->append(reinterpret_cast(&vlen), 8); - iobuf->append(v, vlen); - } - - static void AppendTCPZeroCopy(butil::IOBuf* iobuf, int k, const char* v, - int64_t vlen, bool in_cuda_pinned, - void (*destroy)(void*), void* user_data) { - VLOG(7) << "AppendTCPZeroCopy " - << " k:" << k - << " data:" << static_cast(const_cast(v)) - << " data_size:" << vlen << " in_cuda_pinned:" << in_cuda_pinned; - - iobuf->append(reinterpret_cast(&k), 4); - iobuf->append(reinterpret_cast(&vlen), 8); - - // FIXME(gongwb): use append_zerocopy - /* - if (in_cuda_pinned) { - iobuf->append_zerocopy(v, vlen, IOBufWriter::FreeMemory); - } else { - iobuf->append_zerocopy(v, vlen, nullptr); - } - */ - iobuf->append(v, vlen); - destroy(user_data); - } - -#ifdef PADDLE_WITH_BRPC_RDMA - static void AppendRdmaZeroCopy(const std::string varname, butil::IOBuf* iobuf, - int k, const char* v, int64_t vlen, - bool in_cuda_pinned, void (*destroy)(void*), - void* user_data) { - VLOG(7) << "AppendRdmaZeroCopy varname:" << varname << " k:" << k - << " data:" << static_cast(const_cast(v)) - << " data_size:" << vlen << " in_cuda_pinned:" << in_cuda_pinned; - - iobuf->append(reinterpret_cast(&k), 4); - iobuf->append(reinterpret_cast(&vlen), 8); - - RdmaMemPool::Instance().Register( - varname, static_cast(const_cast(v)), vlen); - - // FIXME(gongwb): use append_zerocopy - // iobuf->append_zerocopy(v, vlen, nullptr); - iobuf->append(v, vlen); - destroy(user_data); - return; - } -#endif - - static void AppendZeroCopy(const std::string varname, butil::IOBuf* iobuf, - int k, const char* v, int64_t vlen, - bool in_cuda_pinned, void (*destroy)(void*), - void* user_data) { - if (vlen >= std::numeric_limits::max() || vlen < 0) { - PADDDLE_THROW(platform::errors::Unavailable( - "Variable lenght is invalid. Variable name is %s, length is %d.", - varname, vlen)); - } - -#ifdef PADDLE_WITH_BRPC_RDMA - IOBufWriter::AppendRdmaZeroCopy(varname, iobuf, k, v, vlen, in_cuda_pinned, - destroy, user_data); -#else - IOBufWriter::AppendTCPZeroCopy(iobuf, k, v, vlen, in_cuda_pinned, destroy, - user_data); -#endif - } -}; - -void SerializeToIOBuf(const std::string& name, framework::Variable* var, - const platform::DeviceContext& ctx, VarMsg* request, - butil::IOBuf* iobuf, const std::string& out_varname, - bool var_is_not_stable, int trainer_id, - const std::string& table_name) { - std::unique_ptr payload; - - request->set_varname(name); - request->set_trainer_id(trainer_id); - // Note: normally the profiler is enabled in 1 trainer, hence only - // 1 trainer returns true for ShouldSendProfileState(). It tells PS - // servers the trainer's profiling state so that PS can follow the - // trainer. 
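The IOBufWriter helpers above all frame a payload the same way: a 4-byte field number, an 8-byte length, then the raw bytes, which is exactly what the brpc variable response reads back on the receiving side with ReadLittleEndian32/64. A minimal sketch of that framing, using std::string in place of butil::IOBuf (illustrative only, not a Paddle API):

#include <cstdint>
#include <string>

// Append one [field(4B)][length(8B)][payload] record, mirroring IOBufWriter::Append.
// Raw byte copies match the little-endian reader only on little-endian hosts, as in the original.
void AppendRecord(std::string* buf, int32_t field, const char* data, int64_t len) {
  buf->append(reinterpret_cast<const char*>(&field), 4);
  buf->append(reinterpret_cast<const char*>(&len), 8);
  buf->append(data, static_cast<size_t>(len));
}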
- if (platform::ShouldSendProfileState()) { - if (platform::IsProfileEnabled()) { - request->set_profile(platform::kEnableProfiler); - } else { - request->set_profile(platform::kDisableProfiler); - } - } - if (!out_varname.empty()) { - request->set_out_varname(out_varname); - } - if (!table_name.empty()) { - request->set_table_name(table_name); - } - if (var->IsType()) { - request->set_type(::sendrecv::LOD_TENSOR); - payload.reset(new TensorPayload(GetTensorPayload(var, ctx, request))); - } else if (var->IsType()) { - request->set_type(::sendrecv::SELECTED_ROWS); - payload.reset(new TensorPayload(GetSelectedRowsPayload(var, ctx, request))); -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) - } else if (var->IsType()) { - request->set_type(::sendrecv::NCCL_ID); - const ncclUniqueId& uid = var->Get(); - // TODO(gongwb): use append_zero to avoid data copy. - IOBufWriter::Append(name, iobuf, - sendrecv::VariableMessage::kSerializedFieldNumber, - uid.internal, NCCL_UNIQUE_ID_BYTES); - return; -#endif - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "Serialize does not support type: %s", typeid(var->Type()).name())); - } - - PADDLE_ENFORCE_NOT_NULL( - payload, - platform::errors::InvalidArgument( - "Not support type: %s, need to be LOD_TENSOR or SELECTED_ROWS.", - var->Type())); - - // FIXME(gongwb): it seems that can use zero copy. - if (var_is_not_stable) { - IOBufWriter::Append( - name, iobuf, ::sendrecv::VariableMessage::kSerializedFieldNumber, - static_cast(payload->ptr()), payload->memory_size()); - } else { - if (platform::is_gpu_place(ctx.GetPlace())) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - IOBufWriter::AppendZeroCopy( - name, iobuf, ::sendrecv::VariableMessage::kSerializedFieldNumber, - static_cast(payload->ptr()), payload->memory_size(), - true, SerializeDestroyCallback, static_cast(payload.get())); - payload.release(); -#endif - } else { - IOBufWriter::AppendZeroCopy( - name, iobuf, ::sendrecv::VariableMessage::kSerializedFieldNumber, - static_cast(payload->ptr()), payload->memory_size(), - false, SerializeDestroyCallback, static_cast(payload.get())); - payload.release(); - } - } - - if (var->IsType()) { - auto* slr = var->GetMutable(); - PADDLE_ENFORCE_EQ(VectorElemName(slr->rows()), typeid(int64_t).name(), - platform::errors::InvalidArgument( - "Got wrong type: %s, expect type: int64_t", - VectorElemName(slr->rows()))); - size_t rows_memory_size = slr->rows().size() * sizeof(int64_t); - - IOBufWriter::Append(name, iobuf, - ::sendrecv::VariableMessage::kRowsFieldNumber, - reinterpret_cast(slr->rows().data()), - static_cast(rows_memory_size)); - } -} - -void DeserializeFromIOBuf(const ::sendrecv::VariableMessage& meta, - const butil::IOBuf& iobuf, - const platform::DeviceContext& ctx, - const framework::Scope* scope, - framework::Variable** var, int* trainer_id) { - operators::distributed::BRPCVariableResponse resp(scope, &ctx); - PADDLE_ENFORCE_EQ( - resp.Parse(iobuf, meta), 0, - platform::errors::InvalidArgument("parse iobuf to tensor error!")); - *var = resp.GetVar(); - *trainer_id = resp.GetTrainerId(); -} - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/brpc/brpc_sendrecvop_utils.h b/paddle/fluid/operators/distributed/brpc/brpc_sendrecvop_utils.h deleted file mode 100644 index a5bdc331eb29c..0000000000000 --- a/paddle/fluid/operators/distributed/brpc/brpc_sendrecvop_utils.h +++ /dev/null @@ -1,49 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. 
All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include -#include -#include - -#include "brpc/channel.h" -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/framework/selected_rows.h" -#include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/framework/var_type.h" -#include "paddle/fluid/operators/distributed/distributed_pb.h" -#include "paddle/fluid/operators/distributed/sendrecvop_utils.h" - -namespace paddle { -namespace operators { -namespace distributed { - -void SerializeToIOBuf(const std::string& name, framework::Variable* var, - const platform::DeviceContext& ctx, VarMsg* request, - butil::IOBuf* iobuf, const std::string& out_varname, - bool var_is_not_stable, const int trainer_id = 0, - const std::string& table_name = std::string()); - -void DeserializeFromIOBuf(const VarMsg& meta, const butil::IOBuf& iobuf, - const platform::DeviceContext& ctx, - const framework::Scope* scope, - framework::Variable** var, int* trainer_id); - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/brpc/brpc_serde_test.cc b/paddle/fluid/operators/distributed/brpc/brpc_serde_test.cc deleted file mode 100644 index bcf20ad076b11..0000000000000 --- a/paddle/fluid/operators/distributed/brpc/brpc_serde_test.cc +++ /dev/null @@ -1,175 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include -#include -#include // NOLINT - -#include "brpc/channel.h" -#include "google/protobuf/text_format.h" -#include "gtest/gtest.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/framework/variable.h" -#include "paddle/fluid/operators/distributed/brpc/brpc_sendrecvop_utils.h" -#include "paddle/fluid/operators/distributed/brpc/brpc_variable_response.h" -#include "paddle/fluid/operators/distributed/sendrecvop_utils.h" -#include "paddle/fluid/operators/distributed/variable_response.h" -#include "paddle/fluid/operators/math/math_function.h" -#include "paddle/fluid/platform/place.h" -#include "paddle/fluid/string/printf.h" - -namespace framework = paddle::framework; -namespace platform = paddle::platform; -namespace operators = paddle::operators; -namespace math = paddle::operators::math; -namespace memory = paddle::memory; - -void RunSerdeTestSelectedRows(platform::Place place) { - platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); - auto& ctx = *pool.Get(place); - - butil::IOBuf iobuf; - sendrecv::VariableMessage msg; - int tensor_numel = 564 * 128; - - // serialize var to IOBuf - { - framework::Variable var; - auto* slr = var.GetMutable(); - slr->set_height(1000); - auto* tensor = slr->mutable_value(); - auto* rows = slr->mutable_rows(); - tensor->Resize(framework::make_ddim({564, 128})); - tensor->mutable_data(place); - math::set_constant(ctx, tensor, 32.7); - for (int i = 0; i < 564; ++i) rows->push_back(i); - - operators::distributed::SerializeToIOBuf("myvar", &var, ctx, &msg, &iobuf, - "", false); - } - - // desrialize - { - framework::Scope scope; - scope.Var("myvar"); - operators::distributed::BRPCVariableResponse resp(&scope, &ctx); - EXPECT_EQ(resp.Parse(iobuf, msg), 0); - - framework::Variable* var2 = resp.GetVar(); - - auto* slr2 = var2->GetMutable(); - auto* tensor2 = slr2->mutable_value(); - auto* rows2 = slr2->mutable_rows(); - float* tensor_data2 = nullptr; - framework::Tensor tmp_tensor; - - if (platform::is_gpu_place(ctx.GetPlace())) { - platform::CPUPlace cpu; - framework::TensorCopy(*tensor2, cpu, &tmp_tensor); - tensor_data2 = tmp_tensor.data(); - } else { - tensor_data2 = const_cast(tensor2->data()); - } - const int64_t* rows_data2 = rows2->data(); - - for (int i = 0; i < tensor_numel; ++i) { - EXPECT_FLOAT_EQ(tensor_data2[i], 32.7); - } - for (size_t i = 0; i < rows2->size(); ++i) { - EXPECT_EQ(rows_data2[i], static_cast(i)); - } - EXPECT_EQ(slr2->height(), 1000); - } -} - -void RunTestLodTensor(platform::Place place) { - platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); - auto& ctx = *pool.Get(place); - - // serialize var to ByteBuffer - butil::IOBuf iobuf; - sendrecv::VariableMessage msg; - int tensor_numel = 512 * 8 * 4 * 2; - { - framework::Variable var; - auto* tensor = var.GetMutable(); - tensor->Resize(framework::make_ddim({512, 8, 4, 2})); - framework::LoD lod; - lod.push_back(framework::Vector({1, 3, 8})); - tensor->set_lod(lod); - tensor->mutable_data(place); - math::set_constant(ctx, tensor, 31.9); - - operators::distributed::SerializeToIOBuf("myvar", &var, ctx, &msg, &iobuf, - "", false); - } - - // check sendrecv::VariableMessage meta data - { - EXPECT_EQ(msg.varname(), "myvar"); - EXPECT_EQ(msg.type(), 0); - EXPECT_EQ(msg.dims()[0], 512); - EXPECT_EQ(msg.dims()[1], 8); - EXPECT_EQ(msg.dims()[2], 4); - EXPECT_EQ(msg.dims()[3], 2); - EXPECT_EQ(msg.lod_level(), 1); - EXPECT_EQ(msg.lod(0).lod_data(0), 1); - 
EXPECT_EQ(msg.lod(0).lod_data(1), 3); - EXPECT_EQ(msg.lod(0).lod_data(2), 8); - } - - // deserialize - { - framework::Scope scope; - scope.Var("myvar"); - operators::distributed::BRPCVariableResponse resp(&scope, &ctx); - EXPECT_EQ(resp.Parse(iobuf, msg), 0); - - framework::Variable* var2 = resp.GetVar(); - - auto tensor2 = var2->Get(); - float* tensor_data2 = nullptr; - framework::Tensor tmp_tensor; - - if (platform::is_gpu_place(ctx.GetPlace())) { - platform::CPUPlace cpu; - framework::TensorCopy(tensor2, cpu, &tmp_tensor); - tensor_data2 = tmp_tensor.data(); - } else { - tensor_data2 = const_cast(tensor2.data()); - } - - for (int i = 0; i < tensor_numel; ++i) - EXPECT_FLOAT_EQ(tensor_data2[i], 31.9); - } -} - -TEST(LodTensor, Run) { - platform::CPUPlace place; - RunTestLodTensor(place); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - platform::CUDAPlace gpu(0); - RunTestLodTensor(gpu); -#endif -} - -TEST(SelectedRows, Run) { - platform::CPUPlace place; - RunSerdeTestSelectedRows(place); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - platform::CUDAPlace gpu; - RunSerdeTestSelectedRows(gpu); -#endif -} diff --git a/paddle/fluid/operators/distributed/brpc/brpc_server.cc b/paddle/fluid/operators/distributed/brpc/brpc_server.cc deleted file mode 100644 index 5ca26f006bf20..0000000000000 --- a/paddle/fluid/operators/distributed/brpc/brpc_server.cc +++ /dev/null @@ -1,417 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
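The BRPCServiceImpl that follows hands every incoming RPC to a per-request-type paddle::framework::ThreadPool (send_threads_, get_threads_, and so on) and returns to brpc immediately; the lambda captures the controller, request, response, and closure by value and does the real work off the RPC thread. A stand-alone sketch of that dispatch shape, with std::async standing in for the removed ThreadPool and hypothetical stand-in types:

#include <future>
#include <iostream>
#include <string>

// Hypothetical stand-ins for the generated message types and the brpc closure.
struct VariableMessage { std::string varname; };
struct VoidMessage {};
struct Closure { void Run() { std::cout << "done\n"; } };

// The real handler work, run off the RPC thread.
void DoSend(const VariableMessage& req, VoidMessage* /*resp*/, Closure* done) {
  std::cout << "handling " << req.varname << "\n";
  done->Run();  // in the removed code, brpc::ClosureGuard does this on scope exit
}

int main() {
  VariableMessage req{"w@GRAD"};
  VoidMessage resp;
  Closure done;
  // Queue the work and return right away, mirroring send_threads_->Run([=] { ... });
  auto fut = std::async(std::launch::async, [&] { DoSend(req, &resp, &done); });
  fut.wait();
  return 0;
}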
- -#include "paddle/fluid/operators/distributed/brpc/brpc_server.h" -#include -#include -#include "paddle/fluid/framework/threadpool.h" -#include "paddle/fluid/operators/distributed/brpc/brpc_sendrecvop_utils.h" -#include "paddle/fluid/operators/distributed/brpc/brpc_variable_response.h" -#include "paddle/fluid/operators/distributed/request_handler.h" - -namespace sendrecv { - -namespace distributed = paddle::operators::distributed; - -typedef std::unordered_map - HandlerMap; - -class BRPCServiceImpl : public SendRecvService { - public: - explicit BRPCServiceImpl(const HandlerMap& rpc_call_map, - distributed::RPCServer* rpc_server) - : rpc_server_(rpc_server) { - VLOG(3) << "BRPCServiceImpl size: " << rpc_call_map.size(); - auto it = rpc_call_map.find(distributed::kRequestSend); - if (it != rpc_call_map.end()) { - request_send_h_ = it->second; - send_threads_.reset(new paddle::framework::ThreadPool( - rpc_server_->GetThreadNum(distributed::kRequestSend))); - } - - it = rpc_call_map.find(distributed::kRequestGet); - if (it != rpc_call_map.end()) { - request_get_h_ = it->second; - get_threads_.reset(new paddle::framework::ThreadPool( - rpc_server_->GetThreadNum(distributed::kRequestGet))); - } - - it = rpc_call_map.find(distributed::kRequestGetNoBarrier); - if (it != rpc_call_map.end()) { - request_getnobarrier_h_ = it->second; - getnobarrier_threads_.reset(new paddle::framework::ThreadPool( - rpc_server_->GetThreadNum(distributed::kRequestGetNoBarrier))); - } - - it = rpc_call_map.find(distributed::kRequestPrefetch); - if (it != rpc_call_map.end()) { - request_prefetch_h_ = it->second; - prefetch_threads_.reset(new paddle::framework::ThreadPool( - rpc_server_->GetThreadNum(distributed::kRequestPrefetch))); - } - - it = rpc_call_map.find(distributed::kRequestCheckpoint); - if (it != rpc_call_map.end()) { - request_checkpoint_h_ = it->second; - checkpoint_notify_threads_.reset(new paddle::framework::ThreadPool( - rpc_server_->GetThreadNum(distributed::kRequestPrefetch))); - } - - it = rpc_call_map.find(distributed::kRequestGetMonomerVariable); - if (it != rpc_call_map.end()) { - request_get_monomer_handler_h_ = it->second; - } - - it = rpc_call_map.find(distributed::kRequestGetMonomerBarrier); - if (it != rpc_call_map.end()) { - request_get_monomer_barrier_handler_h_ = it->second; - } - } - - virtual ~BRPCServiceImpl() {} - void SendVariable(google::protobuf::RpcController* cntl_butil, - const VariableMessage* request, VoidMessage* response, - google::protobuf::Closure* done) override { - send_threads_->Run( - [=] { _SendVariable(cntl_butil, request, response, done); }); - } - - void _SendVariable(google::protobuf::RpcController* cntl_butil, - const VariableMessage* request, VoidMessage* response, - google::protobuf::Closure* done) { - PADDLE_ENFORCE_NOT_NULL( - request_send_h_, platform::errors::PreconditionNotMet( - "RequestSend handler should be registed first!")); - brpc::ClosureGuard done_guard(done); - brpc::Controller* cntl = static_cast(cntl_butil); - - std::string varname = request->varname(); - VLOG(3) << "RequestSend var_name:" << varname - << ", trainer_id:" << request->trainer_id() - << ", from:" << cntl->remote_side(); - - distributed::BRPCVariableResponse resp(request_send_h_->scope(), - request_send_h_->dev_ctx(), - request_send_h_->distributed_mode()); - PADDLE_ENFORCE_EQ( - resp.Parse(cntl->request_attachment(), *request), 0, - platform::errors::InvalidArgument("parse iobuf to tensor error!")); - - auto scope = resp.GetMutableLocalScope(); - auto invar = resp.GetVar(); - int 
trainer_id = request->trainer_id(); - paddle::framework::Variable* outvar = nullptr; - - request_send_h_->Handle(varname, scope, invar, &outvar, trainer_id); - } - - void GetVariable(google::protobuf::RpcController* cntl_butil, - const VariableMessage* request, VariableMessage* response, - google::protobuf::Closure* done) override { - get_threads_->Run( - [=] { _GetVariable(cntl_butil, request, response, done); }); - } - - void GetVariableNoBarrier(google::protobuf::RpcController* cntl_butil, - const VariableMessage* request, - VariableMessage* response, - google::protobuf::Closure* done) override { - getnobarrier_threads_->Run( - [=] { _GetVariableNoBarrier(cntl_butil, request, response, done); }); - } - - void _GetVariable(google::protobuf::RpcController* cntl_butil, - const VariableMessage* request, VariableMessage* response, - google::protobuf::Closure* done) { - PADDLE_ENFORCE_NOT_NULL( - request_get_h_, platform::errors::PreconditionNotMet( - "RequestGet handler should be registed first!")); - - brpc::ClosureGuard done_guard(done); - brpc::Controller* cntl = static_cast(cntl_butil); - - std::string varname = request->varname(); - std::string out_varname = request->out_varname(); - VLOG(3) << "RequestGet varname:" << varname - << ", out_varname:" << out_varname - << ", trainer_id:" << request->trainer_id() - << ", from:" << cntl->remote_side(); - - auto scope = request_get_h_->scope(); - paddle::framework::Variable* invar = nullptr; - int trainer_id = request->trainer_id(); - paddle::framework::Variable* outvar = nullptr; - - request_get_h_->Handle(varname, scope, invar, &outvar, trainer_id, - out_varname); - - if (outvar) { - distributed::SerializeToIOBuf(out_varname, outvar, - *request_get_h_->dev_ctx(), response, - &cntl->response_attachment(), "", false); - } - } - - void _GetVariableNoBarrier(google::protobuf::RpcController* cntl_butil, - const VariableMessage* request, - VariableMessage* response, - google::protobuf::Closure* done) { - PADDLE_ENFORCE_NOT_NULL( - request_getnobarrier_h_, - platform::errors::PreconditionNotMet( - "RequestGetNoBarrier handler should be registed first!")); - - brpc::ClosureGuard done_guard(done); - brpc::Controller* cntl = static_cast(cntl_butil); - - std::string varname = request->varname(); - std::string out_varname = request->out_varname(); - int trainer_id = request->trainer_id(); - - VLOG(3) << "RequestGetNoBarrier varname:" << varname - << ", out_varname:" << out_varname << ", trainer_id:" << trainer_id - << ", from:" << cntl->remote_side(); - - auto scope = request_getnobarrier_h_->scope(); - paddle::framework::Variable* invar = nullptr; - paddle::framework::Variable* outvar = nullptr; - - request_getnobarrier_h_->Handle(varname, scope, invar, &outvar, trainer_id, - out_varname); - - if (outvar) { - distributed::SerializeToIOBuf( - out_varname, outvar, *request_getnobarrier_h_->dev_ctx(), response, - &cntl->response_attachment(), "", false); - } - } - - void PrefetchVariable(google::protobuf::RpcController* cntl_butil, - const VariableMessage* request, - VariableMessage* response, - google::protobuf::Closure* done) override { - prefetch_threads_->Run( - [=] { _PrefetchVariable(cntl_butil, request, response, done); }); - } - - void _PrefetchVariable(google::protobuf::RpcController* cntl_butil, - const VariableMessage* request, - VariableMessage* response, - google::protobuf::Closure* done) { - PADDLE_ENFORCE_NOT_NULL(request_prefetch_h_, - platform::errors::PreconditionNotMet( - "kRequestPrefetch handler should be registed first!"); - - 
brpc::ClosureGuard done_guard(done); - brpc::Controller* cntl = static_cast(cntl_butil); - - // prefetch process... - std::string in_var_name = request->varname(); - std::string out_var_name = request->out_varname(); - VLOG(3) << "RequestPrefetch, in_var_name: " << in_var_name - << ", out_var_name: " << out_var_name - << ", trainer_id:" << request->trainer_id() - << ", from:" << cntl->remote_side(); - - distributed::BRPCVariableResponse resp( - request_prefetch_h_->scope(), request_prefetch_h_->dev_ctx(), true); - - PADDLE_ENFORCE_EQ(resp.Parse(cntl->request_attachment(), *request), 0, - platform::errors::InvalidArgument( - "parse iobuf to tensor error!")); - - auto scope = resp.GetMutableLocalScope(); - auto invar = scope->FindVar(in_var_name); - std::string table_name = request->table_name(); - int trainer_id = request->trainer_id(); - paddle::framework::Variable* outvar = scope->Var(out_var_name); - - request_prefetch_h_->Handle(in_var_name, scope, invar, &outvar, trainer_id, - out_var_name, table_name); - - distributed::SerializeToIOBuf(out_var_name, outvar, - *request_prefetch_h_->dev_ctx(), response, - &cntl->response_attachment(), "", true); - } - - void CheckpointNotify(google::protobuf::RpcController* cntl_butil, - const VariableMessage* request, VoidMessage* response, - google::protobuf::Closure* done) override { - checkpoint_notify_threads_->Run( - [=] { _CheckpointNotify(cntl_butil, request, response, done); }); - } - - void _CheckpointNotify(google::protobuf::RpcController* cntl_butil, - const VariableMessage* request, VoidMessage* response, - google::protobuf::Closure* done) { - PADDLE_ENFORCE_NOT_NULL( - request_checkpoint_h_, - platform::errors::PreconditionNotMet( - "kRequestCheckpointNotify handler should be registed first!")); - - brpc::ClosureGuard done_guard(done); - brpc::Controller* cntl = static_cast(cntl_butil); - - distributed::BRPCVariableResponse resp(request_checkpoint_h_->scope(), - request_checkpoint_h_->dev_ctx()); - - auto scope = resp.GetMutableLocalScope(); - - std::string checkpoint_notify = request->varname(); - std::string checkpoint_dir = request->out_varname(); - int trainer_id = request->trainer_id(); - - VLOG(4) << "RequestCheckpointNotify notify: " << checkpoint_notify - << ", dir: " << checkpoint_dir - << ", trainer_id:" << request->trainer_id() - << ", from:" << cntl->remote_side(); - - request_checkpoint_h_->Handle(checkpoint_notify, scope, nullptr, nullptr, - trainer_id, checkpoint_dir); - } - - void GetMonomerVariable(google::protobuf::RpcController* cntl_butil, - const VariableMessage* request, - VariableMessage* response, - google::protobuf::Closure* done) override { - PADDLE_ENFORCE_NOT_NULL( - request_get_monomer_handler_h_, - platform::errors::PreconditionNotMet( - "kRequestGetMonomerVariable handler should be registed first!")); - - brpc::ClosureGuard done_guard(done); - brpc::Controller* cntl = static_cast(cntl_butil); - - // proc request. 
- std::string varname = request->varname(); - VLOG(3) << "GetMonomerVariable " << varname - << ", trainer_id:" << request->trainer_id() - << ", from:" << cntl->remote_side(); - - rpc_server_->WaitVarCond(varname); - distributed::MonomerHandle h = rpc_server_->GetMonomer(varname); - - auto scope = h.scope_; - auto invar = scope->FindVar(varname); - paddle::framework::Variable* outvar = nullptr; - - request_get_monomer_handler_h_->Handle(varname, scope, invar, &outvar, - request->trainer_id()); - - if (outvar) { - distributed::SerializeToIOBuf(varname, outvar, *h.dev_ctx_, response, - &cntl->response_attachment(), "", false); - } - } - - void GetMonomerBarrier(google::protobuf::RpcController* cntl_butil, - const VariableMessage* request, VoidMessage* response, - google::protobuf::Closure* done) override { - PADDLE_ENFORCE_NOT_NULL( - request_get_monomer_barrier_handler_h_, - platform::errors::PreconditionNotMet( - "RequestGetMonomerBarrier handler should be registed first!")); - - brpc::ClosureGuard done_guard(done); - brpc::Controller* cntl = static_cast(cntl_butil); - - std::string varname = request->varname(); - VLOG(3) << "RequestGetMonomerBarrier var_name:" << varname - << ", trainer_id:" << request->trainer_id() - << ", from:" << cntl->remote_side(); - - rpc_server_->WaitVarCond(varname); - distributed::MonomerHandle h = rpc_server_->GetMonomer(varname); - - paddle::framework::Scope* scope = nullptr; - paddle::framework::Variable* invar = nullptr; - paddle::framework::Variable* outvar = nullptr; - - request_get_monomer_barrier_handler_h_->Handle( - varname, scope, invar, &outvar, request->trainer_id()); - } - - private: - distributed::RequestHandler* request_send_h_{nullptr}; - distributed::RequestHandler* request_get_h_{nullptr}; - distributed::RequestHandler* request_getnobarrier_h_{nullptr}; - distributed::RequestHandler* request_prefetch_h_{nullptr}; - distributed::RequestHandler* request_checkpoint_h_{nullptr}; - distributed::RequestHandler* request_get_monomer_handler_h_{nullptr}; - distributed::RequestHandler* request_get_monomer_barrier_handler_h_{nullptr}; - - distributed::RPCServer* rpc_server_{nullptr}; - - // FIXME(gongwb): brpc should support process one rpc use one threadpool. - std::unique_ptr send_threads_; - std::unique_ptr get_threads_; - std::unique_ptr getnobarrier_threads_; - std::unique_ptr prefetch_threads_; - std::unique_ptr checkpoint_notify_threads_; -}; -} // namespace sendrecv - -namespace paddle { -namespace operators { -namespace distributed { - -void AsyncBRPCServer::StartServer() { - // Instance of your service. - sendrecv::BRPCServiceImpl service_impl(rpc_call_map_, this); - - // Add the service into server. Notice the second parameter, because the - // service is put on stack, we don't want server to delete it, otherwise - // use brpc::SERVER_OWNS_SERVICE. 
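For context, the startup sequence used just below is the standard brpc one: add the stack-allocated service without transferring ownership, fill a brpc::ServerOptions, start on the bind address, and block in Join() until another thread stops the server. A trimmed sketch of that sequence (option values are illustrative; only calls that appear in the removed code are used):

#include <brpc/server.h>
#include <google/protobuf/service.h>

int RunServer(google::protobuf::Service* service, const char* bind_address) {
  brpc::Server server;
  // The service lives on the caller's stack, so the server must not delete it.
  if (server.AddService(service, brpc::SERVER_DOESNT_OWN_SERVICE) != 0) {
    return -1;
  }
  brpc::ServerOptions options;
  options.idle_timeout_sec = -1;  // -1 as in the removed code: never drop idle connections
  if (server.Start(bind_address, &options) != 0) {
    return -1;
  }
  server.Join();  // returns after Stop() is called elsewhere
  return 0;
}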
- if (server_.AddService(&service_impl, brpc::SERVER_DOESNT_OWN_SERVICE) != 0) { - PADDDLE_THROW(platform::errors::Unavailable( - "Failed to add service into BRPC server.")); - return; - } - - brpc::ServerOptions options; -#ifdef PADDLE_WITH_BRPC_RDMA - options.use_rdma = true; -#endif - options.idle_timeout_sec = idle_timeout_s_; - options.max_concurrency = max_concurrency_; - if (server_.Start(bind_address_.c_str(), &options) != 0) { - PADDDLE_THROW(platform::errors::Unavailable( - "Failed to start EchoServer %s.", bind_address_)); - return; - } - - butil::EndPoint ep = server_.listen_address(); - selected_port_ = ep.port; - - { - std::lock_guard lock(this->mutex_ready_); - ready_ = 1; - } - condition_ready_.notify_all(); - - server_.Join(); -} - -void AsyncBRPCServer::ShutDownImpl() { server_.Stop(1000); } - -void AsyncBRPCServer::WaitServerReady() { - VLOG(3) << "AsyncGRPCServer is wait server ready"; - std::unique_lock lock(this->mutex_ready_); - condition_ready_.wait(lock, [=] { return this->ready_ == 1; }); - VLOG(3) << "AsyncGRPCServer WaitSeverReady"; -} - -}; // namespace distributed -}; // namespace operators -}; // namespace paddle diff --git a/paddle/fluid/operators/distributed/brpc/brpc_server.h b/paddle/fluid/operators/distributed/brpc/brpc_server.h deleted file mode 100644 index 78bbe5adc0813..0000000000000 --- a/paddle/fluid/operators/distributed/brpc/brpc_server.h +++ /dev/null @@ -1,53 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include // NOLINT -#include // NOLINT -#include - -#include "brpc/server.h" -#include "paddle/fluid/operators/distributed/distributed_pb.h" -#include "paddle/fluid/operators/distributed/rpc_server.h" - -namespace paddle { -namespace operators { -namespace distributed { - -class AsyncBRPCServer final : public RPCServer { - public: - explicit AsyncBRPCServer(const std::string& address, int client_num) - : RPCServer(address, client_num), ready_(0) {} - - virtual ~AsyncBRPCServer() {} - void StartServer() override; - void WaitServerReady() override; - - private: - void ShutDownImpl() override; - - brpc::Server server_; - - static constexpr int idle_timeout_s_ = -1; - static constexpr int max_concurrency_ = 0; - - std::mutex mutex_ready_; - std::condition_variable condition_ready_; - int ready_; -}; - -}; // namespace distributed -}; // namespace operators -}; // namespace paddle diff --git a/paddle/fluid/operators/distributed/brpc/brpc_variable_response.cc b/paddle/fluid/operators/distributed/brpc/brpc_variable_response.cc deleted file mode 100644 index 49521e8a77057..0000000000000 --- a/paddle/fluid/operators/distributed/brpc/brpc_variable_response.cc +++ /dev/null @@ -1,75 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// - -#include "paddle/fluid/operators/distributed/brpc/brpc_variable_response.h" -#include "paddle/fluid/operators/distributed/send_recv.pb.h" - -namespace paddle { -namespace operators { -namespace distributed { - -namespace pb = ::google::protobuf; -using vr = ::sendrecv::VariableMessage; - -int BRPCVariableResponse::Parse(Source* source) { - pb::io::ZeroCopyInputStream* input_stream = source->contents(); - pb::io::CodedInputStream input(input_stream); - input.SetTotalBytesLimit(INT_MAX, INT_MAX); - - while (1) { - unsigned int tag = 0; - if (!input.ReadLittleEndian32(&tag)) { - break; - } - - uint64_t num_bytes = 0; - if (!input.ReadLittleEndian64(&num_bytes)) { - break; - } - - int field = static_cast(tag); - int ret = field == 0 ? -1 : field; - switch (field) { - case vr::kSerializedFieldNumber: { - if (!ProcSerializedField(field, &input, num_bytes)) { - return ret; - } - break; - } - case vr::kRowsFieldNumber: { - PADDLE_ENFORCE((meta_.type() == sendrecv::SELECTED_ROWS || - meta_.type() == sendrecv::LOD_TENSOR) && - meta_.varname() != "", - platform::errors::PreconditionNotMet( - "meta info should be got first!")); - - if (!CopySelectRowsData(&input, *dev_ctx_, num_bytes)) { - return ret; - } - break; - } - default: { - PADDLE_THROW(platform::errors::Unavailable( - "not surpported %u fieldnumber", field)); - return ret; - } - } - } - - return 0; -} -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/brpc/brpc_variable_response.h b/paddle/fluid/operators/distributed/brpc/brpc_variable_response.h deleted file mode 100644 index 6282f08a72536..0000000000000 --- a/paddle/fluid/operators/distributed/brpc/brpc_variable_response.h +++ /dev/null @@ -1,67 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
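The Parse loop in the removed brpc_variable_response.cc walks the same [field][length][payload] records that IOBufWriter writes: a 4-byte little-endian field number, an 8-byte little-endian byte count, then a field-specific handler (serialized tensor bytes or selected-rows indices). A stripped-down version of that loop over an in-memory buffer, with the Paddle-specific handlers replaced by a skip (protobuf's CodedInputStream API):

#include <cstdint>
#include <string>

#include <google/protobuf/io/coded_stream.h>
#include <google/protobuf/io/zero_copy_stream_impl_lite.h>

// Returns 0 on success or the offending field number, mirroring BRPCVariableResponse::Parse.
int ParseRecords(const std::string& buf) {
  google::protobuf::io::ArrayInputStream array(buf.data(), static_cast<int>(buf.size()));
  google::protobuf::io::CodedInputStream input(&array);
  while (true) {
    uint32_t field = 0;
    if (!input.ReadLittleEndian32(&field)) break;  // clean end of stream
    uint64_t num_bytes = 0;
    if (!input.ReadLittleEndian64(&num_bytes)) break;
    // A real implementation dispatches on the field number here; this sketch just skips the payload.
    if (!input.Skip(static_cast<int>(num_bytes))) return static_cast<int>(field);
  }
  return 0;
}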
- -#pragma once - -#include - -#include "brpc/channel.h" -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/framework/selected_rows.h" -#include "paddle/fluid/framework/var_type.h" - -#include "paddle/fluid/operators/distributed/distributed_pb.h" - -#include "google/protobuf/io/coded_stream.h" -#include "google/protobuf/io/zero_copy_stream.h" -#include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/operators/distributed/variable_response.h" - -namespace paddle { -namespace operators { -namespace distributed { - -class BRPCSourceWrapper : public Source { - public: - explicit BRPCSourceWrapper(const butil::IOBuf& iobuf) : source_(iobuf) {} - ::google::protobuf::io::ZeroCopyInputStream* contents() override { - return &source_; - } - - private: - butil::IOBufAsZeroCopyInputStream source_; -}; - -class BRPCVariableResponse : public VariableResponse { - public: - BRPCVariableResponse(const framework::Scope* scope, - const platform::DeviceContext* dev_ctx, - bool create_scope = false) - : VariableResponse(scope, dev_ctx, create_scope) {} - - virtual ~BRPCVariableResponse() {} - - // parse attachment from iobuf - int Parse(Source* source) override; - int Parse(const butil::IOBuf& iobuf, const sendrecv::VariableMessage& meta) { - BRPCSourceWrapper wrapper(iobuf); - return VariableResponse::Parse(&wrapper, meta); - } -}; - -}; // namespace distributed -}; // namespace operators -}; // namespace paddle diff --git a/paddle/fluid/operators/distributed/collective_client.cc b/paddle/fluid/operators/distributed/collective_client.cc deleted file mode 100644 index fcd3e6abead51..0000000000000 --- a/paddle/fluid/operators/distributed/collective_client.cc +++ /dev/null @@ -1,57 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "paddle/fluid/operators/distributed/collective_client.h" -#include -#include "gflags/gflags.h" - -DECLARE_int32(rpc_deadline); - -namespace paddle { -namespace operators { -namespace distributed { -std::once_flag CollectiveClient::init_flag_; -std::unique_ptr CollectiveClient::client_(nullptr); - -bool CollectiveClient::Gather(const std::vector& remote_vars, - std::vector* dst, - const platform::DeviceContext& ctx, - framework::Scope* scope, int64_t time_out) { - for (auto r : remote_vars) { - VLOG(50) << "begin gather from ep:" << r.String(); - scope->Var(r.var_name_)->GetMutable(); - VarHandlePtr ptr = rpc_client_->AsyncGetMonomerVariable( - r.ep_, ctx, *scope, r.var_name_, time_out); - } - - rpc_client_->Wait(); - - for (auto r : remote_vars) { - auto select_rows = - scope->FindVar(r.var_name_)->GetMutable(); - dst->push_back(select_rows); - - VLOG(4) << "gather from ep:" << r.String() - << ", select_rows:" << GetSelectedRowsInfo(*select_rows); - - rpc_client_->AsyncGetMonomerBarrier(r.ep_, r.var_name_); - } - - rpc_client_->Wait(); - return true; -} - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/collective_client.h b/paddle/fluid/operators/distributed/collective_client.h deleted file mode 100644 index e7d8bb8df9834..0000000000000 --- a/paddle/fluid/operators/distributed/collective_client.h +++ /dev/null @@ -1,104 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#pragma once - -#include // NOLINT -#include -#include -#include - -#include "gflags/gflags.h" -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/operators/distributed/distributed.h" -#include "paddle/fluid/operators/distributed/request_handler.h" - -namespace paddle { -namespace framework { -class Scope; -class SelectedRows; -} // namespace framework -namespace platform { -class DeviceContext; -} // namespace platform -} // namespace paddle - -DECLARE_int32(rpc_deadline); - -namespace paddle { -namespace operators { -namespace distributed { - -inline std::string GetSelectedRowsInfo(const framework::SelectedRows& slr) { - std::stringstream ss; - ss << ", height:" << slr.height() << ", rows:["; - for (unsigned int i = 0; i < slr.rows().size(); i++) { - if (i != slr.rows().size() - 1) { - ss << slr.rows()[i] << ","; - } else { - ss << slr.rows()[i]; - } - } - ss << "], dims:" << slr.value().dims(); - return ss.str(); -} - -struct RemoteVar { - std::string ep_; - std::string var_name_; - int trainer_id_{0}; - - std::string String() { - std::stringstream ss; - ss << "ep:" << ep_ << ", var_name:" << var_name_ - << ", trainer_id:" << trainer_id_; - - return ss.str(); - } -}; - -class CollectiveClient { - public: - CollectiveClient() { - rpc_client_.reset(new RPCCLIENT_T()); - rpc_client_->InitImpl(); - } - virtual ~CollectiveClient() {} - - // note this function will retain the rank order. - bool Gather(const std::vector& remote_vars, - std::vector* dst, - const platform::DeviceContext& ctx, framework::Scope* scope, - int64_t time_out = FLAGS_rpc_deadline); - - static CollectiveClient* GetInstance() { - std::call_once(init_flag_, [&]() { - if (client_.get() == nullptr) { - client_.reset(new CollectiveClient()); - } - }); - return client_.get(); - } - - private: - std::unique_ptr rpc_client_; - - static std::once_flag init_flag_; - static std::unique_ptr client_; -}; -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/collective_server.cc b/paddle/fluid/operators/distributed/collective_server.cc deleted file mode 100644 index cdd37742d2d5a..0000000000000 --- a/paddle/fluid/operators/distributed/collective_server.cc +++ /dev/null @@ -1,68 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
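Both the collective client above and the collective server below expose themselves through the same lazily created singleton: a static std::once_flag plus a static smart pointer, initialized inside std::call_once so concurrent first calls are safe. A minimal stand-alone version of that pattern:

#include <memory>
#include <mutex>

class Singleton {
 public:
  static Singleton* GetInstance() {
    // call_once guarantees the body runs exactly once, even under concurrent callers.
    std::call_once(init_flag_, [] { instance_.reset(new Singleton()); });
    return instance_.get();
  }

 private:
  Singleton() = default;
  static std::once_flag init_flag_;
  static std::unique_ptr<Singleton> instance_;
};

std::once_flag Singleton::init_flag_;
std::unique_ptr<Singleton> Singleton::instance_;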
*/ - -#include "paddle/fluid/operators/distributed/collective_server.h" -#include - -DEFINE_int32(collective_get_thread_num, 5, "number of threads for rpc get"); - -namespace paddle { -namespace operators { -namespace distributed { - -std::once_flag CollectiveServer::init_flag_; -std::shared_ptr CollectiveServer::collective_server_(nullptr); - -CollectiveServer::CollectiveServer(const std::string& end_point, int fan_in) { - VLOG(1) << "Create colllective server:" << end_point << ", fan_in:" << fan_in; - rpc_server_.reset(new RPCSERVER_T(end_point, fan_in)); -} - -void CollectiveServer::Stop() { - rpc_server_->ShutDown(); - server_thread_->join(); - loop_thread_->join(); -} - -void CollectiveServer::StartServer() { - get_monomer_handler_.reset(new GetMonomerHandler()); - get_monomer_handler_->SetRPCServer(rpc_server_.get()); - - get_barrier_handler_.reset(new GetMonomerBarrierHandler()); - get_barrier_handler_->SetRPCServer(rpc_server_.get()); - - rpc_server_->RegisterRPC(distributed::kRequestGetMonomerVariable, - get_monomer_handler_.get(), - FLAGS_collective_get_thread_num); - rpc_server_->RegisterRPC(distributed::kRequestGetMonomerBarrier, - get_barrier_handler_.get(), 1); - - server_thread_.reset(new std::thread([&]() { rpc_server_->StartServer(); })); - rpc_server_->WaitServerReady(); - - loop_thread_.reset(new std::thread([&]() { - while (true) { - if (rpc_server_->IsExit()) { - LOG(WARNING) << "get exit!rpc_processor break!"; - break; - } - sleep(1); - } - VLOG(1) << "CollectiveServer loop_thread end"; - })); -} - -}; // namespace distributed -}; // namespace operators -}; // namespace paddle diff --git a/paddle/fluid/operators/distributed/collective_server.h b/paddle/fluid/operators/distributed/collective_server.h deleted file mode 100644 index 4964923286094..0000000000000 --- a/paddle/fluid/operators/distributed/collective_server.h +++ /dev/null @@ -1,116 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include -#include -#include -#include -#include // NOLINT -#include -#include -#include "gflags/gflags.h" -#include "paddle/fluid/operators/distributed/distributed.h" -#include "paddle/fluid/operators/distributed/request_handler.h" -#include "paddle/fluid/operators/distributed/request_handler_impl.h" -#include "paddle/fluid/operators/distributed/rpc_server.h" - -namespace paddle { -namespace framework { -class Variable; -} // namespace framework -} // namespace paddle - -namespace paddle { -namespace operators { -namespace distributed { - -class CollectiveServer; - -class GetMonomerHandler final : public RequestHandler { - public: - GetMonomerHandler() : RequestHandler(true) {} - virtual ~GetMonomerHandler() {} - bool Handle(const std::string& var_name, framework::Scope* scope, - framework::Variable* var, framework::Variable** outvar, - const int trainer_id, const std::string& out_var_name = "", - const std::string& table_name = "") override { - VLOG(50) << "GetMonomerHandler recv " << var_name; - - *outvar = scope->FindVar(var_name); - PADDLE_ENFORCE_NOT_NULL( - outvar, platform::errors::NotFound("var: %s is not found.", var_name)); - - return true; - } -}; - -class GetMonomerBarrierHandler final : public RequestHandler { - public: - GetMonomerBarrierHandler() : RequestHandler(true) {} - virtual ~GetMonomerBarrierHandler() {} - bool Handle(const std::string& var_name, framework::Scope* scope, - framework::Variable* var, framework::Variable** outvar, - const int trainer_id, const std::string& out_var_name = "", - const std::string& table_name = "") override { - VLOG(50) << "GetMonomerHandler recv " << var_name; - - rpc_server_->IncreaseVarBarrier(var_name); - - return true; - } -}; - -class CollectiveServer final { - public: - explicit CollectiveServer(const std::string& end_point, int fan_in); - - virtual ~CollectiveServer() {} - - void StartServer(); - - static CollectiveServer* GetInstance(const std::string& end_point, - int fan_in) { - std::call_once(init_flag_, [&]() { - if (collective_server_.get() == nullptr) { - collective_server_.reset(new CollectiveServer(end_point, fan_in)); - collective_server_->StartServer(); - } - }); - - return collective_server_.get(); - } - - std::shared_ptr GetRPCServer() { return rpc_server_; } - - void Stop(); - - private: - std::unique_ptr get_monomer_handler_; - std::unique_ptr get_barrier_handler_; - - std::shared_ptr rpc_server_; - std::shared_ptr server_thread_; - std::shared_ptr loop_thread_; - - bool ready_{false}; - - static std::once_flag init_flag_; - static std::shared_ptr collective_server_; -}; - -}; // namespace distributed -}; // namespace operators -}; // namespace paddle diff --git a/paddle/fluid/operators/distributed/collective_server_test.cc b/paddle/fluid/operators/distributed/collective_server_test.cc deleted file mode 100644 index 92b2eb4b51e59..0000000000000 --- a/paddle/fluid/operators/distributed/collective_server_test.cc +++ /dev/null @@ -1,131 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include -#include - -#include "gtest/gtest.h" -#include "paddle/fluid/operators/distributed/collective_client.h" -#include "paddle/fluid/operators/distributed/collective_server.h" - -namespace paddle { -namespace framework { -class Variable; -} // namespace framework -} // namespace paddle - -namespace framework = paddle::framework; -namespace platform = paddle::platform; -namespace distributed = paddle::operators::distributed; - -std::unique_ptr StartServer( - const std::string& ep, int fan_in, framework::Scope* scope, - platform::DeviceContext* dev_ctx) { - distributed::CollectiveServer* server = - distributed::CollectiveServer::GetInstance(ep, fan_in); - - auto rpc_server = server->GetRPCServer(); - rpc_server->RegisterVar("var1", distributed::kRequestGetMonomerVariable, - scope, dev_ctx); - - std::cout << "StartServer return" << std::endl; - return std::unique_ptr(server); -} - -std::unique_ptr GenerateVars(platform::Place place) { - platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); - auto& ctx = *pool.Get(place); - - framework::Scope* scope = new framework::Scope(); - framework::Variable* var = scope->Var("var1"); - auto* slr = var->GetMutable(); - slr->set_height(20000); - - auto* tensor = slr->mutable_value(); - auto* rows = slr->mutable_rows(); - - tensor->Resize(framework::make_ddim({3, 1024})); - tensor->mutable_data(place); - - paddle::operators::math::set_constant(ctx, tensor, 32.7); - for (int i = 0; i < 3; ++i) rows->push_back(i); - - std::cout << "src:" << distributed::GetSelectedRowsInfo(*slr); - - return std::unique_ptr(scope); -} - -void Gather(const std::vector& vars, - platform::DeviceContext* dev_ctx) { - distributed::CollectiveClient* client = - distributed::CollectiveClient::GetInstance(); - - framework::Scope* scope = new framework::Scope(); - framework::Variable* var = scope->Var("var1"); - var->GetMutable(); - - std::vector dst; - client->Gather(vars, &dst, *dev_ctx, scope); - std::cout << "dst:" << distributed::GetSelectedRowsInfo(*dst[0]); - dev_ctx->Wait(); - - ASSERT_EQ(dst[0]->value().dims(), framework::make_ddim({3, 1024})); - ASSERT_EQ(dst[0]->height(), 20000); - ASSERT_EQ(dst[0]->rows().size(), static_cast(3)); - for (int i = 0; i < 3; i++) { - ASSERT_EQ(dst[0]->rows()[i], i); - } - - std::vector vec; - TensorToVector(dst[0]->value(), *dev_ctx, &vec); - for (size_t i = 0; i < 3 * 1024; i++) { - ASSERT_FLOAT_EQ(vec[i], 32.7); - } -} - -TEST(CollectiveServer, GPU) { - setenv("http_proxy", "", 1); - setenv("https_proxy", "", 1); - - platform::CUDAPlace place; - platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); - auto& ctx = *pool.Get(place); - - std::string ep = "127.0.0.1:7164"; - auto scope = GenerateVars(place); - - auto* v1 = scope->FindVar("var1"); - std::cout << "var1:" << v1 << std::endl; - - auto server = StartServer(ep, 2, scope.get(), &ctx); - auto rpc_server = server->GetRPCServer(); - - distributed::RemoteVar var; - var.ep_ = ep; - var.var_name_ = "var1"; - var.trainer_id_ = 0; - - std::vector vars{var}; - Gather(vars, &ctx); - Gather(vars, &ctx); - - std::cout << "begin WaitVarBarrier" << std::endl; - rpc_server->WaitVarBarrier("var1"); - rpc_server->ClearRegisteredVars(); - server->Stop(); - - scope.release(); - server.release(); -} diff --git a/paddle/fluid/operators/distributed/communicator.cc b/paddle/fluid/operators/distributed/communicator.cc deleted file mode 100644 
index 4ee27a6414698..0000000000000 --- a/paddle/fluid/operators/distributed/communicator.cc +++ /dev/null @@ -1,989 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/distributed/communicator.h" - -#include - -#include -#include // NOLINT -#include -#include // NOLINT -#include - -#include "gflags/gflags.h" -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/selected_rows.h" -#include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/framework/threadpool.h" -#include "paddle/fluid/framework/variable_helper.h" -#include "paddle/fluid/operators/distributed/distributed.h" -#include "paddle/fluid/operators/distributed/parameter_recv.h" -#include "paddle/fluid/operators/distributed/parameter_send.h" -#include "paddle/fluid/string/printf.h" -#include "paddle/fluid/string/split.h" - -namespace paddle { -namespace operators { -namespace distributed { - -using Tree = - std::map>>; -using RpcCtxMap = operators::distributed::RpcCtxMap; - -inline double GetCurrentUS() { - struct timeval time; - gettimeofday(&time, NULL); - return 1e+6 * time.tv_sec + time.tv_usec; -} - -Communicator::Communicator() {} - -std::once_flag Communicator::init_flag_; -std::shared_ptr Communicator::communicator_(nullptr); - -void AsyncCommunicator::InitImpl(const RpcCtxMap &send_varname_to_ctx, - const RpcCtxMap &recv_varname_to_ctx, - Scope *recv_scope) { - send_varname_to_ctx_ = std::move(send_varname_to_ctx); - recv_varname_to_ctx_ = std::move(recv_varname_to_ctx); - recv_scope_ = std::move(recv_scope); - - if (send_varname_to_ctx.size() == 0) { - VLOG(0) << "nothing need to be send, will not start send_thread"; - } else { - send_scope_.reset(new Scope()); - for (auto &iter : send_varname_to_ctx_) { - if (iter.first == STEP_COUNTER && !need_global_step_) continue; - send_varname_to_queue_[iter.first] = - std::make_shared>>( - send_queue_size_); - } - send_threadpool_.reset(new ::ThreadPool(thread_pool_size_)); - } - - if (recv_varname_to_ctx.size() == 0) { - VLOG(0) << "nothing need to be received, will not start recv_thread"; - } else { - recv_threadpool_.reset(new ::ThreadPool(thread_pool_size_)); - } - - InitParams(); -} - -void AsyncCommunicator::InitParams() { RecvNoBarrier(); } - -AsyncCommunicator::~AsyncCommunicator() { - running_ = false; - if (main_thread_) main_thread_->join(); -} - -void AsyncCommunicator::SendGlobalStep(int batches) { - if (!need_global_step_) { - return; - } - - if (batches == 0) { - return; - } - - auto &var_name = STEP_COUNTER; - auto *out_var = send_scope_->Var(var_name); - auto *out_t = out_var->GetMutable(); - auto *data = out_t->mutable_data({1}, platform::CPUPlace()); - data[0] = static_cast(batches); - - auto &ctx = send_varname_to_ctx_.at(var_name); - auto send_functor = distributed::ParameterSend(); - send_functor(ctx, *send_scope_, true, 1); -} - -void AsyncCommunicator::SendByCommunicator() { - std::vector> task_futures; - 
task_futures.reserve(send_varname_to_ctx_.size()); - VLOG(3) << "run send graph"; - - auto before_run_send_graph = GetCurrentUS(); - for (auto &iter : send_varname_to_queue_) { - auto &var_name = iter.first; - auto &var_queue = iter.second; - - auto send_task = [this, &var_name, &var_queue] { - VLOG(3) << var_name << " merge and send; "; - std::vector> vars; - - int merged_var_num = 0; - int wait_times = 0; - while (merged_var_num < max_merge_var_num_) { - if (var_queue->Size() == 0) { - VLOG(4) << "wait_times -> " << wait_times; - if (wait_times >= send_wait_times_) { - break; - } - std::this_thread::sleep_for(std::chrono::milliseconds(10)); - wait_times++; - continue; - } else { - wait_times = 0; - - vars.push_back(var_queue->Pop()); - merged_var_num++; - } - } - auto before_merge = GetCurrentUS(); - if (var_name == STEP_COUNTER) { - SendGlobalStep(merged_var_num); - auto after_merge = GetCurrentUS(); - VLOG(3) << "merge and send " << merged_var_num << " " << var_name - << " use time " << after_merge - before_merge; - return; - } - - auto &ctx = send_varname_to_ctx_.at(var_name); - - MergeVars(var_name, vars, send_scope_.get(), ctx.merge_add); - auto after_merge = GetCurrentUS(); - VLOG(3) << "merge " << merged_var_num << " " << var_name << " use time " - << after_merge - before_merge; - - auto send_functor = distributed::ParameterSend(); - send_functor(ctx, *send_scope_, true, 1); - auto after_send = GetCurrentUS(); - VLOG(3) << "send " << var_name << " use time " - << after_send - after_merge; - - if (var_name.rfind("@GRAD") != var_name.size() - 5) return; - - auto recv_param = var_name.substr(0, var_name.size() - 5); - if (recv_varname_to_ctx_.find(recv_param) == recv_varname_to_ctx_.end()) - return; - - auto recv_functor = distributed::ParameterRecv(); - recv_functor(recv_varname_to_ctx_.at(recv_param), *recv_scope_); - auto after_recv = GetCurrentUS(); - VLOG(3) << "recv " << recv_param << " use time " - << after_recv - after_send; - }; - task_futures.emplace_back(send_threadpool_->enqueue(std::move(send_task))); - } - for (auto &task_f : task_futures) { - task_f.wait(); - } - auto after_run_send_graph = GetCurrentUS(); - - VLOG(3) << "run send graph use time " - << (after_run_send_graph - before_run_send_graph); -} - -void HalfAsyncCommunicator::SendByCommunicator() { - std::vector> task_futures; - task_futures.reserve(send_varname_to_ctx_.size()); - VLOG(3) << "run send graph"; - - int batches = BatchesCounter(); - if (batches <= 0) return; - - auto before_run_send_graph = GetCurrentUS(); - for (auto &iter : send_varname_to_queue_) { - auto &var_name = iter.first; - auto &var_queue = iter.second; - - auto send_task = [this, batches, &var_name, &var_queue] { - VLOG(3) << var_name << " merge and send; "; - auto before_task = GetCurrentUS(); - std::vector> vars; - vars.reserve(batches); - - for (int i = 0; i < batches; ++i) { - vars.push_back(var_queue->Pop()); - } - - if (var_name == STEP_COUNTER) { - SendGlobalStep(batches); - auto end_task = GetCurrentUS(); - VLOG(3) << "merge " << batches << " " << var_name << " use time " - << end_task - before_task; - return; - } - - auto &ctx = send_varname_to_ctx_.at(var_name); - - auto before_merge = GetCurrentUS(); - MergeVars(var_name, vars, send_scope_.get(), ctx.merge_add); - auto after_merge = GetCurrentUS(); - VLOG(3) << "merge " << batches << " " << var_name << " use time " - << after_merge - before_merge; - - auto send_functor = distributed::ParameterSend(); - send_functor(ctx, *send_scope_, true, 1); - auto after_send = 
GetCurrentUS(); - VLOG(3) << "send " << var_name << " use time " - << after_send - before_task; - - if (var_name.rfind("@GRAD") != var_name.size() - 5) return; - - auto recv_param = var_name.substr(0, var_name.size() - 5); - if (recv_varname_to_ctx_.find(recv_param) == recv_varname_to_ctx_.end()) - return; - - auto recv_functor = distributed::ParameterRecv(); - recv_functor(recv_varname_to_ctx_.at(recv_param), *recv_scope_); - auto after_recv = GetCurrentUS(); - VLOG(3) << "recv " << recv_param << " use time " - << after_recv - after_send; - return; - }; - task_futures.emplace_back(send_threadpool_->enqueue(std::move(send_task))); - } - for (auto &task_f : task_futures) { - task_f.wait(); - } - auto after_run_send_graph = GetCurrentUS(); - - VLOG(3) << "run send graph use time " - << (after_run_send_graph - before_run_send_graph); -} - -void AsyncCommunicator::MainThread() { - VLOG(3) << "MainThread start and wait"; - - while (waiting_ && running_) { - std::this_thread::sleep_for(std::chrono::milliseconds(100)); - VLOG(3) << "wait for running"; - } - - while (running_) { - SendByCommunicator(); - BarrierSend(); - } - VLOG(3) << "communicator stopped, send thread exit"; -} - -void HalfAsyncCommunicator::MainThread() { - VLOG(3) << "MainThread start and wait"; - - while (waiting_ && running_) { - std::this_thread::sleep_for(std::chrono::milliseconds(100)); - VLOG(3) << "wait for running"; - } - - while (running_) { - SendByCommunicator(); - BarrierSend(); - RecvByCommunicator(); - BarrierRecv(); - BarrierWeakUp(); - } - VLOG(3) << "communicator stopped, send thread exit"; -} - -void AsyncCommunicator::RecvByCommunicator() { - VLOG(3) << "parallel run recv graph"; - if (!running_) return; - RecvNoBarrier(); - VLOG(3) << "run recv graph use time"; -} - -void AsyncCommunicator::RecvNoBarrier() { - std::vector> task_futures; - task_futures.reserve(recv_varname_to_ctx_.size()); - - for (auto &iter : recv_varname_to_ctx_) { - auto recv_task = [this, &iter] { - auto before_task = GetCurrentUS(); - auto &var_name = iter.first; - auto recv_functor = distributed::ParameterRecv(); - recv_functor(iter.second, *recv_scope_); - auto end_task = GetCurrentUS(); - VLOG(1) << "recv var " << var_name << " use time " - << (end_task - before_task); - }; - task_futures.emplace_back(recv_threadpool_->enqueue(std::move(recv_task))); - } - - for (auto &task : task_futures) { - task.wait(); - } -} - -void AsyncCommunicator::Start() { - VLOG(3) << "Communicator start"; - if (!communicator_) { - VLOG(0) << "Communicator is not inited, do nothing"; - } else { - VLOG(3) << "start send thread and recv thread"; - waiting_ = true; - running_ = true; - BarrierTriggerReset(max_merge_var_num_); - // start send and recv thread - main_thread_.reset( - new std::thread(std::bind(&AsyncCommunicator::MainThread, this))); - } -} - -void AsyncCommunicator::Stop() { - VLOG(3) << "Communicator stop"; - running_ = false; - if (!communicator_) { - VLOG(0) << "Communicator is not inited, do nothing"; - } else { - if (main_thread_) { - VLOG(3) << "stop send thread"; - main_thread_->join(); - main_thread_.reset(nullptr); - } - } - VLOG(3) << "Communicator stop done"; -} - -void AsyncCommunicator::Send(const std::vector &var_names, - const std::vector &var_tables, - const framework::Scope &scope) { - waiting_ = false; - - PADDLE_ENFORCE_EQ( - var_tables.size(), 1, - platform::errors::InvalidArgument("var_tables.size() == 1 is permitted")); - - auto table_name = var_tables[0]; - - if (table_name == STEP_COUNTER && !need_global_step_) return; - 
- auto before_send_op = GetCurrentUS(); - auto &queue = send_varname_to_queue_.at(table_name); - - if (table_name == STEP_COUNTER) { - auto tmp_var = std::make_shared(); - auto *tensor = tmp_var->GetMutable(); - tensor->Resize(framework::make_ddim({1})); - auto *out_d = tensor->mutable_data(platform::CPUPlace()); - out_d[0] = 1; - queue->Push(tmp_var); - } else { - PADDLE_ENFORCE_GE(var_names.size(), 1, - platform::errors::InvalidArgument( - "var_names.size() >= 1 is permitted")); - - auto *var = scope.FindVar(var_names[0]); - - PADDLE_ENFORCE_EQ( - var->IsInitialized(), true, - platform::errors::InvalidArgument("grad var should be inited")); - - auto tmp_var = std::make_shared(); - if (var->IsType()) { - framework::CopyVariable(*var, tmp_var.get()); - queue->Push(tmp_var); - } else if (var->IsType()) { - // push var into send queue by var_name - auto var_name = var_names[0]; - framework::CopyVariable(*var, tmp_var.get()); - queue->Push(tmp_var); - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "unknown var type to copy, only support LoDTensor/SelectedRows")); - } - } - auto after_send_op = GetCurrentUS(); - VLOG(3) << "send to " << table_name << " with queue size " << queue->Size() - << ", use time " << (after_send_op - before_send_op); -} - -void HalfAsyncCommunicator::Clean() { - for (auto &iter : send_varname_to_queue_) { - auto &var_name = iter.first; - auto &var_queue = iter.second; - - while (var_queue->Size() > 0) { - var_queue->Pop(); - } - - VLOG(3) << "clean var: " << var_name << " done"; - } -} - -int HalfAsyncCommunicator::BatchesCounter() { - while (running_) { - if (barrier_counter_.load() >= barrier_trigger_.load() && - barrier_trigger_.load() != 0) { - break; - } else { - std::this_thread::sleep_for(std::chrono::milliseconds(10)); - } - } - - return barrier_counter_.load(); -} - -void HalfAsyncCommunicator::Barrier() { - barrier_counter_++; - - if (!running_) { - VLOG(3) << "Communicator is not running, release barrier"; - return; - } - - { - std::unique_lock lk(barrier_mutex_); - barrier_cond_.wait(lk, [this] { return (barrier_counter_ == 0); }); - } -} - -void HalfAsyncCommunicator::BarrierTriggerDecrement() { - barrier_trigger_--; - VLOG(3) << "BarrierTriggerDecrement decrement barrier trigger to " - << barrier_trigger_.load(); -} - -void HalfAsyncCommunicator::BarrierTriggerReset(int initial_val) { - barrier_trigger_.store(initial_val); - - VLOG(3) << "BarrierTriggerReset reset barrier trigger to " - << barrier_trigger_.load(); -} - -void HalfAsyncCommunicator::BarrierWeakUp() { - barrier_counter_.store(0); - barrier_cond_.notify_all(); -} - -void SyncCommunicator::BarrierSend() { - if (!running_) return; - - distributed::RPCClient *rpc_client = - distributed::RPCClient::GetInstance(trainer_id_); - - std::vector rets; - - for (auto &ep : pserver_endpoints_) { - rets.push_back(rpc_client->AsyncSendBatchBarrier(ep)); - } - - for (size_t i = 0; i < rets.size(); i++) { - PADDLE_ENFORCE_NE(rets[i]->Wait(), 0U, platform::errors::External( - "internal error in RPCClient")); - } - - VLOG(4) << "BarrierSend with SyncCommunicator"; -} - -void SyncCommunicator::BarrierRecv() { - if (!running_) return; - - distributed::RPCClient *rpc_client = - distributed::RPCClient::GetInstance(trainer_id_); - - std::vector rets; - for (auto &ep : pserver_endpoints_) { - rets.push_back(rpc_client->AsyncSendFetchBarrier(ep)); - } - - for (size_t i = 0; i < rets.size(); i++) { - PADDLE_ENFORCE_NE(rets[i]->Wait(), 0U, platform::errors::External( - "internal error in RPCClient")); - } 
- - VLOG(4) << "BarrierRecv with SyncCommunicator"; -} - -void GeoCommunicator::InitImpl(const RpcCtxMap &send_varname_to_ctx, - const RpcCtxMap &recv_varname_to_ctx, - Scope *recv_scope) { - send_varname_to_ctx_ = std::move(send_varname_to_ctx); - recv_varname_to_ctx_ = std::move(recv_varname_to_ctx); - recv_scope_ = std::move(recv_scope); - - PADDLE_ENFORCE_GT( - send_varname_to_ctx.size(), 0, - platform::errors::InvalidArgument("send var contexts can not be zero")); - - send_scope_.reset(new Scope()); - for (auto &iter : send_varname_to_ctx_) { - auto &varname = iter.first; - - if (varname == STEP_COUNTER) { - send_varname_to_queue_[varname] = - std::make_shared>>( - send_queue_size_); - } else { - auto &send_ctx = iter.second; - - send_var_nums_ += send_ctx.splited_varnames.size(); - if (!send_ctx.is_sparse) { - continue; - } - int pserver_num = static_cast(send_ctx.epmap.size()); - for (int ep_idx = 0; ep_idx < pserver_num; ep_idx++) { - sparse_id_queues_.insert( - std::pair>>>>( - send_ctx.splited_varnames[ep_idx], - std::make_shared< - BlockingQueue>>>( - send_queue_size_))); - } - } - } - send_threadpool_.reset(new ::ThreadPool(thread_pool_size_)); - - if (recv_varname_to_ctx.size() == 0) { - VLOG(0) << "nothing need to be received, will not start recv_thread"; - } else { - recv_threadpool_.reset(new ::ThreadPool(thread_pool_size_)); - } - - delta_scope_.reset(new Scope()); - old_scope_.reset(new Scope()); - pserver_scope_.reset(new Scope()); - - InitParams(); -} - -void GeoCommunicator::Send(const std::vector &var_names, - const std::vector &var_tables, - const framework::Scope &scope) { - waiting_ = false; - PADDLE_ENFORCE_EQ( - var_tables.size(), 1, - platform::errors::InvalidArgument("var_tables.size() == 1 is permitted")); - - auto table_name = var_tables[0]; - if (table_name == STEP_COUNTER) return; - - auto before_send = GetCurrentUS(); - size_t splited_var_nums = - send_varname_to_ctx_[table_name].splited_varnames.size(); - - std::unordered_map> ids_table; - - for (size_t j = 0; j < splited_var_nums; j++) { - ids_table.insert(std::pair>( - send_varname_to_ctx_[table_name].splited_varnames[j], - std::unordered_set())); - } - auto *var = scope.FindVar(var_names[0]); - auto &rows = var->Get().rows(); - - // insert ids which has not been record - for (size_t j = 0; j < rows.size(); j++) { - auto ep_idx = rows[j] % splited_var_nums; - ids_table.at(send_varname_to_ctx_[table_name].splited_varnames[ep_idx]) - .insert(rows[j]); - } - - auto before_push = GetCurrentUS(); - for (auto &iter : ids_table) { - auto &key = iter.first; - auto &sparse_ids_set = iter.second; - auto sparse_ids_vec = std::make_shared>(); - sparse_ids_vec->assign(sparse_ids_set.begin(), sparse_ids_set.end()); - sparse_id_queues_.at(key)->Push(sparse_ids_vec); - VLOG(3) << "push " << sparse_ids_vec->size() << " ids to " << key - << "'s queue"; - } - auto after_send = GetCurrentUS(); - VLOG(3) << "run send " << table_name << " op finish. 
using " - << (before_push - before_send) << "; " << (after_send - before_push); -} - -void GeoCommunicator::MainThread() { - VLOG(3) << "MainThread start and wait"; - - while (waiting_ && running_) { - std::this_thread::sleep_for(std::chrono::milliseconds(100)); - VLOG(3) << "wait for running"; - } - - while (running_) { - std::vector> tasks; - tasks.reserve(send_var_nums_); - - for (auto &iter : send_varname_to_ctx_) { - auto &var_name = iter.first; - auto &send_ctx = iter.second; - int pserver_num = static_cast(send_ctx.epmap.size()); - if (send_ctx.is_sparse) { - for (int ep_idx = 0; ep_idx < pserver_num; ep_idx++) { - auto send_recv_task = [this, ep_idx, &var_name] { - auto before_send_sparse = GetCurrentUS(); - if (var_name == STEP_COUNTER) { - return; - } - auto send_varname = - send_varname_to_ctx_.at(var_name).splited_varnames[ep_idx]; - auto sparse_ids = MergeSparseIds(send_varname); - if (sparse_ids.size() == 0) { - return; - } - SendSparse(var_name, ep_idx, sparse_ids); - auto after_send_sparse = GetCurrentUS(); - RecvSparse(var_name, ep_idx); - auto after_recv_sparse = GetCurrentUS(); - VLOG(3) - << "send recv " - << send_varname_to_ctx_.at(var_name).splited_varnames[ep_idx] - << " finish, using " << (after_send_sparse - before_send_sparse) - << " and " << (after_recv_sparse - after_send_sparse) - << "; total = " << (after_recv_sparse - before_send_sparse); - }; - tasks.emplace_back( - send_threadpool_->enqueue(std::move(send_recv_task))); - } - } else { - auto send_recv_task = [this, &var_name, &send_ctx] { - if (var_name == STEP_COUNTER) { - return; - } - SendDense(var_name); - RecvDense(var_name); - }; - tasks.emplace_back( - send_threadpool_->enqueue(std::move(send_recv_task))); - } - } - for (auto &task : tasks) { - task.wait(); - } - } -} - -std::vector GeoCommunicator::MergeSparseIds( - const std::string &send_varname) { - size_t merge_num = 0, wait_times = 0; - std::unordered_set sparse_ids; - while (merge_num < static_cast(max_merge_var_num_)) { - VLOG(3) << "Merge Number of " << send_varname << " = " << merge_num; - if (sparse_id_queues_.at(send_varname)->Size() > 0) { - wait_times = 0; - std::shared_ptr> pop_ids = - sparse_id_queues_.at(send_varname)->Pop(); - for (size_t j = 0; j < pop_ids->size(); j++) { - sparse_ids.insert(pop_ids->at(j)); - } - merge_num += 1; - VLOG(3) << "sparse_id_queues_(" << send_varname << ") pushed"; - } else if (sparse_id_queues_.at(send_varname)->Size() == 0) { - VLOG(3) << "wait_times -> " << wait_times; - if (wait_times >= static_cast(send_wait_times_)) { - break; - } - std::this_thread::sleep_for(std::chrono::milliseconds(10)); - wait_times++; - continue; - } - } - std::vector res; - res.assign(sparse_ids.begin(), sparse_ids.end()); - return res; -} -void GeoCommunicator::SendSparse(const std::string &varname, int ep_idx, - const std::vector &sparse_ids) { - auto &rpc_ctx = send_varname_to_ctx_.at(varname); - auto send_varname = rpc_ctx.splited_varnames[ep_idx]; - auto trainer_id = rpc_ctx.trainer_id; - auto endpoint = rpc_ctx.epmap[ep_idx]; - auto pserver_num = rpc_ctx.epmap.size(); - - auto *var_latest = recv_scope_->FindVar(varname); - - PADDLE_ENFORCE_EQ(var_latest->IsInitialized(), true, - platform::errors::Unavailable( - "%s is not initialized, please check", varname)); - auto &t_latest = var_latest->Get(); - - auto dims1 = t_latest.dims()[1]; - - auto cpu_ctx = paddle::platform::CPUDeviceContext(); - auto *var_delta = delta_scope_->Var(send_varname); - auto *t_delta = var_delta->GetMutable(); - - auto *t_value = 
t_delta->mutable_value(); - t_value->mutable_data( - framework::make_ddim({static_cast(sparse_ids.size()), dims1}), - cpu_ctx.GetPlace()); - - std::vector *>> values; - auto *ins = distributed::LargeScaleKV::GetInstance(); - ins->Get(varname)->Get(sparse_ids, {"Param"}, &values); - - auto blas = math::GetBlas(cpu_ctx); - float coefficient = 1.0 / static_cast(trainers_); - - for (auto j = 0; j < static_cast(sparse_ids.size()); ++j) { - blas.VSUB(dims1, t_latest.data() + sparse_ids[j] * dims1, - values[j][0]->data(), t_value->data() + j * dims1); - blas.SCAL(dims1, coefficient, t_value->data() + j * dims1); - blas.VADD(dims1, values[j][0]->data(), t_value->data() + j * dims1, - values[j][0]->data()); - } - - std::vector send_rows; - send_rows.reserve(sparse_ids.size()); - for (auto idx : sparse_ids) { - send_rows.push_back(idx / pserver_num); - } - t_delta->set_height(rpc_ctx.height_sections[ep_idx]); - t_delta->set_rows(send_rows); - - platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); - auto &cpu_ctx_send = *pool.Get(platform::CPUPlace()); - distributed::RPCClient *rpc_client = - distributed::RPCClient::GetInstance(trainer_id); - - auto ret = rpc_client->AsyncSendVar(endpoint, cpu_ctx_send, - *delta_scope_.get(), send_varname); - ret->Wait(); -} - -void GeoCommunicator::SendDense(const std::string &varname) { - auto *var_latest = recv_scope_->FindVar(varname); - auto *var_timestamp = old_scope_->FindVar(varname); - - PADDLE_ENFORCE_EQ(var_latest->IsInitialized(), true, - platform::errors::Unavailable( - "%s is not initialized, please check", varname)); - PADDLE_ENFORCE_EQ(var_timestamp->IsInitialized(), true, - platform::errors::Unavailable( - "%s is not initialized, please check", varname)); - - auto &t_latest = var_latest->Get(); - auto t_timestamp = var_timestamp->GetMutable(); - - auto cpu_ctx = paddle::platform::CPUDeviceContext(); - auto *var_delta = delta_scope_->Var(varname); - auto *t_delta = var_delta->GetMutable(); - t_delta->mutable_data(t_latest.dims(), cpu_ctx.GetPlace()); - - auto blas = math::GetBlas(cpu_ctx); - blas.VSUB(t_latest.numel(), t_latest.data(), - t_timestamp->data(), t_delta->data()); - - float coefficient = 1.0 / static_cast(trainers_); - blas.SCAL(t_latest.numel(), coefficient, t_delta->data()); - - blas.VADD(t_latest.numel(), t_timestamp->data(), - t_delta->data(), t_timestamp->data()); - - auto &ctx = send_varname_to_ctx_.at(varname); - auto send = distributed::ParameterSend(); - send(ctx, *delta_scope_, true, 1); -} - -void GeoCommunicator::RecvByCommunicator() { return; } - -void GeoCommunicator::RecvSparse(const std::string &varname, int ep_idx) { - auto train_id = recv_varname_to_ctx_.at(varname).trainer_id; - auto endpoint = recv_varname_to_ctx_.at(varname).epmap[ep_idx]; - auto splited_var_name = - recv_varname_to_ctx_.at(varname).splited_varnames[ep_idx]; - auto pserver_num = recv_varname_to_ctx_.at(varname).epmap.size(); - - platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); - auto &cpu_ctx_recv = *pool.Get(platform::CPUPlace()); - distributed::RPCClient *rpc_client = - distributed::RPCClient::GetInstance(train_id); - - auto *var_psrever = pserver_scope_->Var(splited_var_name); - auto handle = rpc_client->AsyncGetVar(endpoint, cpu_ctx_recv, - *pserver_scope_.get(), splited_var_name, - splited_var_name, splited_var_name); - handle->Wait(); - - auto *var_latest = recv_scope_->FindVar(varname); - - PADDLE_ENFORCE_EQ( - var_psrever->IsInitialized(), true, - platform::errors::Unavailable( - "%s in 
pserver scope is not initialized, please check", varname)); - - std::vector ids; - ids.assign(var_psrever->Get().rows().begin(), - var_psrever->Get().rows().end()); - - for (size_t j = 0; j < ids.size(); j++) { - ids[j] = ids[j] * pserver_num + ep_idx; - } - - VLOG(3) << "RecvSparse receive var: " << splited_var_name - << " ids Size: " << ids.size(); - - auto t_psrever = var_psrever->Get().value(); - - std::vector *>> old_values; - - auto *ins = distributed::LargeScaleKV::GetInstance(); - ins->Get(varname)->Get(ids, {"Param"}, &old_values); - - auto *t_latest = var_latest->GetMutable(); - - auto dims1 = t_latest->dims()[1]; - auto numel = ids.size() * dims1; - - std::vector v_delta; - v_delta.resize(numel); - - auto cpu_ctx = paddle::platform::CPUDeviceContext(); - auto blas = math::GetBlas(cpu_ctx); - - for (auto j = 0; j < static_cast(ids.size()); ++j) { - blas.VSUB(dims1, t_psrever.data() + j * dims1, - old_values[j][0]->data(), v_delta.data() + j * dims1); - blas.VADD(dims1, t_latest->data() + ids[j] * dims1, - v_delta.data() + j * dims1, - t_latest->data() + ids[j] * dims1); - blas.VCOPY(dims1, t_psrever.data() + j * dims1, - old_values[j][0]->data()); - } -} - -void GeoCommunicator::RecvDense(const std::string &varname) { - auto *var_latest = recv_scope_->FindVar(varname); - auto *var_timestamp = old_scope_->FindVar(varname); - auto *var_psrever = pserver_scope_->Var(varname); - - auto &ctx = recv_varname_to_ctx_.at(varname); - auto recv = distributed::ParameterRecv(); - recv(ctx, *pserver_scope_); - - PADDLE_ENFORCE_EQ( - var_psrever->IsInitialized(), true, - platform::errors::Unavailable( - "%s in pserver scope is not initialized, please check", varname)); - - auto t_psrever = var_psrever->Get(); - auto t_latest = var_latest->GetMutable(); - auto t_timestamp = var_timestamp->GetMutable(); - - auto cpu_ctx = paddle::platform::CPUDeviceContext(); - auto *var_delta = delta_scope_->Var(varname); - auto *t_delta = var_delta->GetMutable(); - t_delta->mutable_data(t_latest->dims(), cpu_ctx.GetPlace()); - - auto blas = math::GetBlas(cpu_ctx); - blas.VSUB(t_latest->numel(), t_psrever.data(), - t_timestamp->data(), t_delta->data()); - blas.VADD(t_latest->numel(), t_latest->data(), t_delta->data(), - t_latest->data()); - blas.VCOPY(t_latest->numel(), t_psrever.data(), - t_timestamp->data()); -} - -void GeoCommunicator::InitParams() { - std::vector> tasks; - tasks.reserve(recv_varname_to_ctx_.size()); - - for (auto &iter : recv_varname_to_ctx_) { - auto &var_name = iter.first; - auto &recv_ctx = iter.second; - - auto recv_task = [this, &var_name, &recv_ctx] { - if (!recv_ctx.is_sparse) { - InitDense(var_name); - } - }; - tasks.emplace_back(send_threadpool_->enqueue(std::move(recv_task))); - } - - for (auto &task : tasks) { - task.wait(); - } - InitSparse(); -} - -void GeoCommunicator::InitDense(const std::string varname) { - auto &ctx = recv_varname_to_ctx_.at(varname); - auto recv = distributed::ParameterRecv(); - recv(ctx, *recv_scope_); - - auto *global_var = recv_scope_->FindVar(varname); - global_var->GetMutable(); - - auto *old_var = old_scope_->Var(varname); - old_var->GetMutable(); - - framework::CopyVariable(*global_var, old_var); - VLOG(1) << "init dense variable " << varname << " done"; -} - -void GeoCommunicator::InitSparse() { - auto sparse_metas = string::split_string(sparse_attrs_, "#"); - - std::vector metas; - std::vector dicts; - - for (auto &sparse_meta : sparse_metas) { - auto attrs = string::split_string(sparse_meta, ":"); - - auto meta = distributed::SparseMeta(); - 
meta.name = attrs[0]; - meta.value_names = {"Param"}; - - auto dic = string::split_string(attrs[1], ","); - dicts.push_back(std::stoi(dic[0])); - meta.value_dims = {std::stoi(dic[1])}; - meta.mode = distributed::Mode::training; - meta.grad_name = "none"; - meta.cached_varnames = {}; - meta.initializer_attrs = string::split_string(attrs[2]); - meta.entry = "none"; - - VLOG(3) << "add sparse meta: " << meta.ToString(); - metas.push_back(meta); - } - - LargeScaleKV::Init(metas); - - for (auto &meta : metas) { - auto &ctx = recv_varname_to_ctx_.at(meta.name); - auto recv = distributed::ParameterRecv(); - - auto *global_var = recv_scope_->FindVar(meta.name); - auto global_value = global_var->Get(); - auto rows = global_value.dims()[0]; - auto dim1 = global_value.dims()[1]; - - recv(ctx, *recv_scope_); - VLOG(1) << "recv " << meta.name << " with global scope for init"; - - auto n_rows = global_var->Get().dims()[0]; - - PADDLE_ENFORCE_EQ( - rows, n_rows, - platform::errors::InvalidArgument( - "global var: %s origin dim must equal recved rows", meta.name)); - - std::vector ids(rows); - std::iota(ids.begin(), ids.end(), 0); - - auto *ins = distributed::LargeScaleKV::GetInstance(); - std::vector *>> values; - - ins->Get(meta.name)->Init(ids); - ins->Get(meta.name)->Get(ids, {"Param"}, &values); - - auto blas = math::GetBlas( - paddle::platform::CPUDeviceContext()); - - for (auto &id : ids) { - blas.VCOPY(dim1, global_value.data() + id * dim1, - values[id][0]->data()); - } - } - - VLOG(3) << "init sparse variable done"; -} - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/communicator.h b/paddle/fluid/operators/distributed/communicator.h deleted file mode 100644 index 4be3253d3923f..0000000000000 --- a/paddle/fluid/operators/distributed/communicator.h +++ /dev/null @@ -1,490 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "gflags/gflags.h" -#include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/framework/variable.h" -#include "paddle/fluid/framework/variable_helper.h" -#include "paddle/fluid/operators/distributed/communicator_common.h" -#include "paddle/fluid/operators/distributed/distributed.h" -#include "paddle/fluid/operators/distributed/large_scale_kv.h" -#include "paddle/fluid/operators/distributed/rpc_client.h" -#include "paddle/fluid/operators/distributed_ops/send_recv_util.h" -#include "paddle/fluid/operators/math/blas.h" -#include "paddle/fluid/operators/math/math_function.h" -#include "paddle/fluid/operators/math/selected_rows_functor.h" -#include "paddle/fluid/platform/device_context.h" -#include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/place.h" -#include "paddle/fluid/string/split.h" - -DECLARE_bool(communicator_is_sgd_optimizer); - -namespace paddle { -namespace operators { -namespace distributed { - -using Scope = framework::Scope; -using Variable = framework::Variable; - -template -class BlockingQueue { - public: - explicit BlockingQueue(size_t capacity) : capacity_(capacity) { - PADDLE_ENFORCE_GT(capacity_, 0, - platform::errors::InvalidArgument( - "The capacity must be greater than 0.")); - } - - bool Push(const T &elem) { - { - std::unique_lock lock(mutex_); - cv_.wait(lock, [&] { return queue_.size() < capacity_; }); - PADDLE_ENFORCE_LT( - queue_.size(), capacity_, - platform::errors::OutOfRange("The queue size: %s out of capacity:%s", - queue_.size(), capacity_)); - queue_.push_back(elem); - } - cv_.notify_one(); - return true; - } - - bool Push(T &&elem) { - { - std::unique_lock lock(mutex_); - cv_.wait(lock, [&] { return queue_.size() < capacity_; }); - PADDLE_ENFORCE_LT( - queue_.size(), capacity_, - platform::errors::OutOfRange("The queue size: %s out of capacity:%s", - queue_.size(), capacity_)); - queue_.emplace_back(std::move(elem)); - } - cv_.notify_one(); - return true; - } - - T Pop() { - std::unique_lock lock(mutex_); - cv_.wait(lock, [=] { return !queue_.empty(); }); - T rc(std::move(queue_.front())); - queue_.pop_front(); - cv_.notify_one(); - return rc; - } - - size_t Cap() const { - std::lock_guard lock(mutex_); - return capacity_; - } - - size_t Size() const { - std::lock_guard lock(mutex_); - return queue_.size(); - } - - private: - const size_t capacity_; - std::deque queue_; - - mutable std::mutex mutex_; - std::condition_variable cv_; -}; - -template -using EigenVector = framework::EigenVector; - -template -inline void MergeVars(const std::string &var_name, - const std::vector> &vars, - Scope *scope, bool merge_add = true) { - PADDLE_ENFORCE_NE(vars.empty(), true, platform::errors::InvalidArgument( - "vector vars are empty.")); - auto cpu_place = platform::CPUPlace(); - auto &var0 = vars[0]; - auto *out_var = scope->Var(var_name); - if (var0->IsType()) { - auto dims = var0->Get().dims(); - VLOG(3) << "merge " << var_name << " LoDTensor dims " << dims - << "; merge add: " << merge_add; - // init output tensor - auto *out_t = out_var->GetMutable(); - out_t->mutable_data(dims, cpu_place); - // check the input dims - for (auto &var : vars) { - auto &var_t = var->Get(); - PADDLE_ENFORCE_EQ( - var_t.dims(), dims, - platform::errors::InvalidArgument("vars should have the same dims")); - } - - // set output tensor to 0. 
- auto cpu_ctx = paddle::platform::CPUDeviceContext(); - math::SetConstant constant_functor; - constant_functor(cpu_ctx, out_t, static_cast(0)); - // sum all vars to out - auto result = EigenVector::Flatten(*out_t); - for (auto &var : vars) { - auto &in_t = var->Get(); - auto in = EigenVector::Flatten(in_t); - result.device(*cpu_ctx.eigen_device()) = result + in; - } - if (!merge_add) { - result.device(*cpu_ctx.eigen_device()) = - result / static_cast(vars.size()); - } - } else if (var0->IsType()) { - auto &slr0 = var0->Get(); - auto *out_slr = out_var->GetMutable(); - out_slr->mutable_rows()->clear(); - out_slr->mutable_value()->mutable_data({{}}, cpu_place); - std::vector inputs; - inputs.reserve(vars.size()); - for (auto &var : vars) { - inputs.push_back(&var->Get()); - } - auto dev_ctx = paddle::platform::CPUDeviceContext(); - if (merge_add) { - math::scatter::MergeAdd merge_add; - merge_add(dev_ctx, inputs, out_slr); - } else { - math::scatter::MergeAverage - merge_average; - merge_average(dev_ctx, inputs, out_slr); - } - - VLOG(3) << "merge " << var_name << " SelectedRows height: " << slr0.height() - << " dims: " << slr0.value().dims() << "; merge add: " << merge_add; - } else { - PADDLE_THROW(platform::errors::InvalidArgument("unsupported var type: %s!", - var0->Type())); - } -} - -using RpcCtxMap = std::unordered_map; -using SparseValue = std::unordered_map>; - -class Communicator { - public: - Communicator(); - - explicit Communicator(const std::map &envs_) { - for (auto &iter : envs_) { - envs[iter.first] = iter.second; - } - } - - virtual ~Communicator() {} - - virtual void Start() = 0; - - virtual void Stop() = 0; - - virtual bool IsRunning() { return running_; } - - virtual void Clean() {} - - virtual void Send(const std::vector &var_names, - const std::vector &var_tables, - const framework::Scope &scope) = 0; - - virtual void RecvNoBarrier() {} - - virtual void Barrier() {} - - virtual void BarrierTriggerDecrement() {} - - virtual void BarrierTriggerReset(int init_counter) {} - - virtual void InitEnvs() = 0; - - virtual void InitImpl(const RpcCtxMap &send_varname_to_ctx, - const RpcCtxMap &recv_varname_to_ctx, - Scope *recv_scope) {} - - static Communicator *GetInstance() { return communicator_.get(); } - - static std::shared_ptr GetInstantcePtr() { - return communicator_; - } - - template - static Communicator *InitInstance( - const RpcCtxMap &send_ctx, const RpcCtxMap &recv_ctx, Scope *recv_scope, - const std::map &envs) { - std::call_once(init_flag_, &Communicator::InitWithRpcCtx, send_ctx, - recv_ctx, recv_scope, std::ref(envs)); - return communicator_.get(); - } - - // Init is called by InitInstance. 
- template - static void InitWithRpcCtx(const RpcCtxMap &send_ctx, - const RpcCtxMap &recv_ctx, Scope *recv_scope, - const std::map &envs) { - if (communicator_.get() == nullptr) { - communicator_.reset(new T(std::ref(envs))); - communicator_->InitEnvs(); - communicator_->InitImpl(send_ctx, recv_ctx, recv_scope); - } - } - - protected: - bool running_ = false; - bool waiting_ = true; - static std::shared_ptr communicator_; - static std::once_flag init_flag_; - std::unordered_map envs; -}; - -class AsyncCommunicator : public Communicator { - public: - AsyncCommunicator() : Communicator() {} - - explicit AsyncCommunicator(const std::map &envs) - : Communicator(envs) {} - - ~AsyncCommunicator(); - - void InitEnvs() { - min_send_grad_num_before_recv_ = - std::stoi(envs.at("communicator_min_send_grad_num_before_recv")); - thread_pool_size_ = std::stoi(envs.at("communicator_thread_pool_size")); - max_merge_var_num_ = std::stoi(envs.at("communicator_max_merge_var_num")); - send_wait_times_ = std::stoi(envs.at("communicator_send_wait_times")); - send_queue_size_ = std::stoi(envs.at("communicator_send_queue_size")); - need_global_step_ = - static_cast(std::stoi(envs.at("need_global_step"))); - VLOG(0) << "AsyncCommunicator Initialized"; - } - - void Start() override; - - void Stop() override; - - void InitImpl(const RpcCtxMap &send_varname_to_ctx, - const RpcCtxMap &recv_varname_to_ctx, - Scope *recv_scope) override; - - void InitParams(); - - virtual void MainThread(); - - void Send(const std::vector &var_names, - const std::vector &var_tables, - const framework::Scope &scope) override; - - virtual void SendByCommunicator(); - virtual void SendGlobalStep(int batches); - - virtual void RecvByCommunicator(); - - virtual void RecvNoBarrier(); - - virtual void BarrierSend() {} - - virtual void BarrierRecv() {} - - virtual void BarrierWeakUp() {} - - protected: - int min_send_grad_num_before_recv_; - int thread_pool_size_; - int max_merge_var_num_; - int send_wait_times_; - int send_queue_size_; - int trainer_id_ = 0; - bool need_global_step_ = false; - - std::unordered_map>>> - send_varname_to_queue_; - RpcCtxMap send_varname_to_ctx_; - RpcCtxMap recv_varname_to_ctx_; - std::unique_ptr main_thread_{nullptr}; - Scope *recv_scope_; // should be global scope - std::unique_ptr send_scope_; // an independent scope - std::unique_ptr<::ThreadPool> send_threadpool_{nullptr}; - std::unique_ptr<::ThreadPool> recv_threadpool_{nullptr}; - std::atomic_uint grad_num_{0}; // the num of gradient sent since last recv -}; - -class HalfAsyncCommunicator : public AsyncCommunicator { - public: - HalfAsyncCommunicator() {} - - explicit HalfAsyncCommunicator(const std::map &envs) - : AsyncCommunicator(envs) {} - - void InitEnvs() { - min_send_grad_num_before_recv_ = 0; - - max_merge_var_num_ = std::stoi(envs.at("communicator_max_merge_var_num")); - send_wait_times_ = std::stoi(envs.at("communicator_send_wait_times")); - thread_pool_size_ = std::stoi(envs.at("communicator_thread_pool_size")); - send_queue_size_ = std::stoi(envs.at("communicator_send_queue_size")); - need_global_step_ = - static_cast(std::stoi(envs.at("need_global_step"))); - VLOG(0) << "HalfAsyncCommunicator Initialized"; - } - - void MainThread() override; - - void SendByCommunicator() override; - - void Clean() override; - - void Barrier() override; - - void BarrierTriggerDecrement() override; - - void BarrierTriggerReset(int initial_val) override; - - int BatchesCounter(); - - void BarrierWeakUp(); - - protected: - // mutex for Wait for barrier - 
std::mutex barrier_mutex_; - std::condition_variable barrier_cond_; - std::atomic barrier_trigger_{0}; - std::atomic barrier_counter_{0}; -}; - -class SyncCommunicator : public HalfAsyncCommunicator { - public: - SyncCommunicator() : HalfAsyncCommunicator() {} - - explicit SyncCommunicator(const std::map &envs) - : HalfAsyncCommunicator(envs) {} - - void InitEnvs() { - min_send_grad_num_before_recv_ = 0; - - max_merge_var_num_ = std::stoi(envs.at("communicator_max_merge_var_num")); - send_wait_times_ = std::stoi(envs.at("communicator_send_wait_times")); - thread_pool_size_ = std::stoi(envs.at("communicator_thread_pool_size")); - send_queue_size_ = std::stoi(envs.at("communicator_send_queue_size")); - need_global_step_ = - static_cast(std::stoi(envs.at("need_global_step"))); - - trainer_id_ = std::stoi(envs.at("trainer_id")); - auto pserver_strings = envs.at("pserver_endpoints"); - pserver_endpoints_ = paddle::string::Split(pserver_strings, ','); - VLOG(0) << "SyncCommunicator Initialized"; - } - - void BarrierSend(); - - void BarrierRecv(); - - private: - std::vector pserver_endpoints_{}; -}; - -class GeoCommunicator : public AsyncCommunicator { - public: - GeoCommunicator() : AsyncCommunicator() {} - - explicit GeoCommunicator(const std::map &envs) - : AsyncCommunicator(envs) {} - - void InitImpl(const RpcCtxMap &send_varname_to_ctx, - const RpcCtxMap &recv_varname_to_ctx, - Scope *recv_scope) override; - void MainThread() override; - void InitEnvs() { - min_send_grad_num_before_recv_ = 0; - - max_merge_var_num_ = std::stoi(envs.at("communicator_max_merge_var_num")); - send_wait_times_ = std::stoi(envs.at("communicator_send_wait_times")); - thread_pool_size_ = std::stoi(envs.at("communicator_thread_pool_size")); - - send_queue_size_ = max_merge_var_num_; - trainers_ = std::stoi(envs.at("trainers")); - sparse_attrs_ = envs.at("sparse_attrs"); - VLOG(0) << "GeoCommunicator Initialized"; - } - - void Send(const std::vector &var_names, - const std::vector &var_tables, - const framework::Scope &scope) override; - - void SendByCommunicator() { return; } - - std::vector MergeSparseIds(const std::string &send_varname); - - void SendSparse(const std::string &varname, int ep_idx, - const std::vector &sparse_ids); - - void SendDense(const std::string &varname); - - void SendGlobalStep(int batches) override {} - - void RecvByCommunicator() override; - - void RecvSparse(const std::string &varname, int ep_idx); - - void RecvDense(const std::string &varname); - - void InitParams(); - - void InitSparse(); - - void InitDense(const std::string varname); - - private: - int trainers_; - std::string sparse_attrs_; - - // parameter for delta calc and send - std::shared_ptr delta_scope_; - - // parameter for storage the pserver param after last recv - std::shared_ptr old_scope_; - - // parameter on pserver - std::shared_ptr pserver_scope_; - - int send_var_nums_ = 0; - - std::unordered_map> old_sparses_; - - std::unordered_map< - std::string, - std::shared_ptr>>>> - sparse_id_queues_; -}; - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/communicator_common.h b/paddle/fluid/operators/distributed/communicator_common.h deleted file mode 100644 index 122d904eba27a..0000000000000 --- a/paddle/fluid/operators/distributed/communicator_common.h +++ /dev/null @@ -1,91 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <iostream>
-#include <string>
-#include <vector>
-
-namespace paddle {
-namespace operators {
-namespace distributed {
-
-struct CommContext {
-  CommContext() = default;
-
-  CommContext(const std::string &name, const std::vector<std::string> &names,
-              const std::vector<std::string> &emap,
-              const std::vector<int64_t> &sections,
-              const std::vector<std::string> &origin_names, int id,
-              bool merge_add_ = true, bool is_sparse_ = true,
-              bool is_distributed_ = false)
-      : var_name(name),
-        splited_varnames(names),
-        epmap(emap),
-        height_sections(sections),
-        origin_varnames(origin_names),
-        trainer_id(id),
-        merge_add(merge_add_),
-        is_sparse(is_sparse_),
-        is_distributed(is_distributed_) {}
-
-  CommContext(const CommContext &ctx) {
-    var_name = ctx.var_name;
-    splited_varnames = ctx.splited_varnames;
-    epmap = ctx.epmap;
-    height_sections = ctx.height_sections;
-    trainer_id = ctx.trainer_id;
-    merge_add = ctx.merge_add;
-    is_sparse = ctx.is_sparse;
-    origin_varnames = ctx.origin_varnames;
-    is_distributed = ctx.is_distributed;
-  }
-
-  std::string print() const {
-    std::stringstream ss;
-
-    ss << "varname: " << var_name << " trainer_id: " << trainer_id << " ";
-
-    for (size_t i = 0; i < splited_varnames.size(); i++) {
-      ss << "slice varname: " << splited_varnames[i] << " ep: " << epmap[i]
-         << " section: " << height_sections[i] << " ";
-    }
-
-    ss << "origin varnames: ";
-    for (size_t i = 0; i < origin_varnames.size(); i++) {
-      ss << origin_varnames[i] << " ";
-    }
-
-    ss << " aggregation->add: " << merge_add << " ";
-    ss << " is_sparse: " << is_sparse << "\n";
-    ss << " is_distributed: " << is_distributed << "\n";
-
-    return ss.str();
-  }
-
-  std::string var_name;
-  std::vector<std::string> splited_varnames;
-  std::vector<std::string> epmap;
-  std::vector<int64_t> height_sections;
-  std::vector<std::string> origin_varnames;
-  int trainer_id;
-  bool merge_add;
-  bool is_sparse;
-  bool is_distributed;
-};
-
-}  // namespace distributed
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/distributed/communicator_test.cc b/paddle/fluid/operators/distributed/communicator_test.cc
deleted file mode 100644
index 38b7c8b00317e..0000000000000
--- a/paddle/fluid/operators/distributed/communicator_test.cc
+++ /dev/null
@@ -1,106 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <gtest/gtest.h>
-
-#include "paddle/fluid/operators/distributed/communicator.h"
-
-namespace paddle {
-namespace operators {
-namespace distributed {
-
-using LoDTensor = framework::LoDTensor;
-using SelectedRows = framework::SelectedRows;
-
-TEST(communicator, merge_lod_tensors) {
-  auto cpu_place = platform::CPUPlace();
-  auto dims = framework::make_ddim({2, 3});
-  std::vector<std::shared_ptr<Variable>> in_vars;
-  float out_value = 0;
-  for (auto i = 0; i < 10; ++i) {
-    auto var = std::make_shared<Variable>();
-    in_vars.emplace_back(var);
-    auto *tensor = var->GetMutable<LoDTensor>();
-    auto *data = tensor->mutable_data<float>(dims, cpu_place);
-    for (auto j = 0; j < tensor->numel(); ++j) {
-      data[j] = static_cast<float>(i);
-    }
-    out_value += static_cast<float>(i);
-  }
-  const std::string out_name = "Out";
-  std::unique_ptr<framework::Scope> scope;
-  scope.reset(new framework::Scope());
-  scope->Var(out_name);
-  for (auto i = 0; i < 10; ++i) {
-    MergeVars<float>(out_name, in_vars, scope.get());
-  }
-  auto &out_tensor = scope->FindVar(out_name)->Get<LoDTensor>();
-  auto *out_data = out_tensor.data<float>();
-  ASSERT_EQ(out_tensor.dims(), dims);
-  for (auto i = 0; i < out_tensor.numel(); ++i) {
-    ASSERT_EQ(out_data[i], out_value);
-  }
-}
-
-TEST(communicator, merge_selected_rows) {
-  auto cpu_place = platform::CPUPlace();
-  int64_t width = 10;
-  std::vector<std::shared_ptr<Variable>> in_vars;
-  const int64_t height = 100;
-  for (auto i = 0; i < 10; ++i) {
-    std::vector<int64_t> rows;
-    for (auto k = 0; k <= i; ++k) {
-      rows.push_back(k);
-    }
-    auto var = std::make_shared<Variable>();
-    in_vars.emplace_back(var);
-    auto *slr = var->GetMutable<SelectedRows>();
-    slr->set_height(height);
-    slr->set_rows(rows);
-    auto dims =
-        framework::make_ddim({static_cast<int64_t>(rows.size()), width});
-    auto *data = slr->mutable_value()->mutable_data<float>(dims, cpu_place);
-    for (size_t i = 0; i < rows.size(); ++i) {
-      for (auto j = 0; j < width; ++j) {
-        data[i * width + j] = static_cast<float>(rows[i]);
-      }
-    }
-  }
-  const std::string out_name = "Out";
-  std::unique_ptr<framework::Scope> scope;
-  scope.reset(new framework::Scope());
-  scope->Var(out_name);
-  for (auto i = 0; i < 10; ++i) {
-    MergeVars<float>(out_name, in_vars, scope.get());
-  }
-  auto &out_slr = scope->FindVar(out_name)->Get<SelectedRows>();
-  auto &out_t = out_slr.value();
-  auto *out_data = out_t.data<float>();
-  ASSERT_EQ(out_t.dims(), framework::make_ddim({10, width}));
-  std::vector<float> out_values;
-  out_values.reserve(10);
-  for (auto i = 0; i < 10; ++i) {
-    out_values.push_back(static_cast<float>(i * (10 - i)));
-  }
-  for (size_t i = 0; i < out_slr.rows().size(); ++i) {
-    ASSERT_EQ(out_slr.rows()[i], static_cast<int64_t>(i));
-    for (auto j = 0; j < width; ++j) {
-      ASSERT_EQ(out_data[i * width + j], out_values[i]);
-    }
-  }
-}
-
-}  // namespace distributed
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/distributed/distributed.h b/paddle/fluid/operators/distributed/distributed.h
deleted file mode 100644
index 5917c18fb0d20..0000000000000
--- a/paddle/fluid/operators/distributed/distributed.h
+++ /dev/null
@@ -1,36 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#ifdef PADDLE_WITH_DISTRIBUTE
-
-#ifdef PADDLE_WITH_GRPC
-#include "paddle/fluid/operators/distributed/communicator.h"
-
-#include "paddle/fluid/operators/distributed/grpc/grpc_client.h"
-#include "paddle/fluid/operators/distributed/grpc/grpc_server.h"
-#define RPCSERVER_T paddle::operators::distributed::AsyncGRPCServer
-#define RPCCLIENT_T paddle::operators::distributed::GRPCClient
-
-#else  // PADDLE_WITH_GRPC
-
-#include "paddle/fluid/operators/distributed/brpc/brpc_client.h"
-#include "paddle/fluid/operators/distributed/brpc/brpc_server.h"
-#define RPCSERVER_T paddle::operators::distributed::AsyncBRPCServer
-#define RPCCLIENT_T paddle::operators::distributed::BRPCClient
-
-#endif  // PADDLE_WITH_GRPC
-
-#endif  // PADDLE_WITH_DISTRIBUTE
diff --git a/paddle/fluid/operators/distributed/grpc/grpc_bytebuffer_stream.cc b/paddle/fluid/operators/distributed/grpc/grpc_bytebuffer_stream.cc
deleted file mode 100644
index 7d6756b41363d..0000000000000
--- a/paddle/fluid/operators/distributed/grpc/grpc_bytebuffer_stream.cc
+++ /dev/null
@@ -1,92 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-// NOTE: This file was originally created by tensorflow
-// (https://github.com/tensorflow/tensorflow/) we borrow this
-// file and did some modifications so that we can send gRPC
-// requests without too much copying of the tensor data.
-
-#include "paddle/fluid/operators/distributed/grpc/grpc_bytebuffer_stream.h"
-
-namespace grpc {
-class ByteBuffer;
-}  // namespace grpc
-
-namespace paddle {
-namespace operators {
-namespace distributed {
-
-GrpcByteBufferSource::GrpcByteBufferSource() {}
-
-bool GrpcByteBufferSource::Init(const grpc::ByteBuffer& src) {
-  cur_ = -1;
-  left_ = 0;
-  ptr_ = nullptr;
-  byte_count_ = 0;
-  bool ok = src.Dump(&slices_).ok();
-  if (!ok) {
-    slices_.clear();
-  }
-  return ok;
-}
-
-bool GrpcByteBufferSource::Next(const void** data, int* size) {
-  // Use loop instead of if in case buffer contained empty slices.
-  while (left_ == 0) {
-    // Advance to next slice.
-    cur_++;
-    if (cur_ >= slices_.size()) {
-      return false;
-    }
-    const ::grpc::Slice& s = slices_[cur_];
-    left_ = s.size();
-    ptr_ = reinterpret_cast<const char*>(s.begin());
-  }
-
-  *data = ptr_;
-  *size = left_;
-  byte_count_ += left_;
-  ptr_ += left_;
-  left_ = 0;
-  return true;
-}
-
-void GrpcByteBufferSource::BackUp(int count) {
-  ptr_ -= count;
-  left_ += count;
-  byte_count_ -= count;
-}
-
-bool GrpcByteBufferSource::Skip(int count) {
-  const void* data;
-  int size;
-  while (Next(&data, &size)) {
-    if (size >= count) {
-      BackUp(size - count);
-      return true;
-    }
-    // size < count;
-    count -= size;
-  }
-  // error or we have too large count;
-  return false;
-}
-
-google::protobuf::int64 GrpcByteBufferSource::ByteCount() const {
-  return byte_count_;
-}
-
-}  // namespace distributed
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/distributed/grpc/grpc_bytebuffer_stream.h b/paddle/fluid/operators/distributed/grpc/grpc_bytebuffer_stream.h
deleted file mode 100644
index 486870de7a554..0000000000000
--- a/paddle/fluid/operators/distributed/grpc/grpc_bytebuffer_stream.h
+++ /dev/null
@@ -1,174 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-// NOTE: This file was originally created by tensorflow
-// (https://github.com/tensorflow/tensorflow/) we borrow this
-// file and did some modifications so that we can send gRPC
-// requests without too much copying of the tensor data.
- -#pragma once - -#include - -#include "google/protobuf/io/coded_stream.h" -#include "google/protobuf/io/zero_copy_stream.h" -#include "grpc++/grpc++.h" -#include "paddle/fluid/operators/distributed/variable_response.h" - -struct grpc_byte_buffer; - -namespace grpc { -// A ZeroCopyInputStream that reads from grpc_byte_buffer -class ByteBuffer; - -class GrpcBufferReader final - : public ::google::protobuf::io::ZeroCopyInputStream { - typedef void (CoreCodegenInterface::*OldReaderInitAPI)( - grpc_byte_buffer_reader* reader, grpc_byte_buffer* buffer); - typedef int (CoreCodegenInterface::*NewReaderInitAPI)( - grpc_byte_buffer_reader* reader, grpc_byte_buffer* buffer); - void ReaderInit(OldReaderInitAPI ptr, grpc_byte_buffer_reader* reader, - grpc_byte_buffer* buffer) { - (g_core_codegen_interface->*ptr)(reader, buffer); - } - void ReaderInit(NewReaderInitAPI ptr, grpc_byte_buffer_reader* reader, - grpc_byte_buffer* buffer) { - int result = (g_core_codegen_interface->*ptr)(reader, buffer); - (void)result; - } - - public: - explicit GrpcBufferReader(grpc_byte_buffer* buffer) - : byte_count_(0), backup_count_(0) { - ReaderInit(&CoreCodegenInterface::grpc_byte_buffer_reader_init, &reader_, - buffer); - } - ~GrpcBufferReader() override { - g_core_codegen_interface->grpc_byte_buffer_reader_destroy(&reader_); - } - - bool Next(const void** data, int* size) override { - if (backup_count_ > 0) { - *data = GRPC_SLICE_START_PTR(slice_) + GRPC_SLICE_LENGTH(slice_) - - backup_count_; - GPR_CODEGEN_ASSERT(backup_count_ <= INT_MAX); - *size = static_cast(backup_count_); - backup_count_ = 0; - return true; - } - if (!g_core_codegen_interface->grpc_byte_buffer_reader_next(&reader_, - &slice_)) { - return false; - } - g_core_codegen_interface->grpc_slice_unref(slice_); - *data = GRPC_SLICE_START_PTR(slice_); - // On win x64, int is only 32bit - GPR_CODEGEN_ASSERT(GRPC_SLICE_LENGTH(slice_) <= INT_MAX); - byte_count_ += * size = static_cast(GRPC_SLICE_LENGTH(slice_)); - return true; - } - - void BackUp(int count) override { backup_count_ = count; } - - bool Skip(int count) override { - const void* data; - int size; - while (Next(&data, &size)) { - if (size >= count) { - BackUp(size - count); - return true; - } - // size < count; - count -= size; - } - // error or we have too large count; - return false; - } - - ::google::protobuf::int64 ByteCount() const override { - return byte_count_ - backup_count_; - } - - private: - int64_t byte_count_; - int64_t backup_count_; - grpc_byte_buffer_reader reader_; - grpc_slice slice_; -}; - -}; // namespace grpc - -namespace paddle { -namespace operators { -namespace distributed { - -// A ZeroCopyInputStream that reads from a grpc::ByteBuffer. -class GrpcByteBufferSource - : public ::google::protobuf::io::ZeroCopyInputStream { - public: - GrpcByteBufferSource(); - bool Init(const ::grpc::ByteBuffer& src); // Can be called multiple times. - bool Next(const void** data, int* size) override; - void BackUp(int count) override; - bool Skip(int count) override; - ::google::protobuf::int64 ByteCount() const override; - - private: - std::vector<::grpc::Slice> slices_; - size_t cur_; // Current slice index. - int left_; // Number of bytes in slices_[cur_] left to yield. - const char* ptr_; // Address of next byte in slices_[cur_] to yield. 
- ::google::protobuf::int64 byte_count_; -}; - -class GrpcByteBufferSourceWrapper : public Source { - public: - explicit GrpcByteBufferSourceWrapper(GrpcByteBufferSource* source) - : source_(source) {} - ::google::protobuf::io::ZeroCopyInputStream* contents() override { - return source_; - } - - private: - GrpcByteBufferSource* source_; -}; - -class GrpcByteSource : public Source { - public: - explicit GrpcByteSource(grpc_byte_buffer* buffer) : buffer_(buffer) {} - ~GrpcByteSource() override { DeleteStream(); } - - typedef ::grpc::GrpcBufferReader Reader; - - ::google::protobuf::io::ZeroCopyInputStream* contents() override { - DeleteStream(); - stream_ = new (&space_) Reader(buffer_); - return stream_; - } - - private: - void DeleteStream() { - if (stream_) { - stream_->~Reader(); - } - } - - grpc_byte_buffer* buffer_; // Not owned - Reader* stream_ = nullptr; // Points into space_ if non-nullptr - char space_[sizeof(Reader)]; -}; - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/grpc/grpc_client.cc b/paddle/fluid/operators/distributed/grpc/grpc_client.cc deleted file mode 100644 index 97a9c14e4f185..0000000000000 --- a/paddle/fluid/operators/distributed/grpc/grpc_client.cc +++ /dev/null @@ -1,671 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include -#include - -#include "glog/logging.h" // For VLOG -#include "paddle/fluid/framework/threadpool.h" -#include "paddle/fluid/operators/distributed/grpc/grpc_client.h" -#include "paddle/fluid/operators/distributed/grpc/grpc_serde.h" -#include "paddle/fluid/operators/distributed/request_handler.h" -#include "paddle/fluid/platform/port.h" -#include "paddle/fluid/platform/profiler.h" - -DEFINE_int32(rpc_client_threads, 2, ""); -DECLARE_bool(rpc_disable_reuse_port); - -namespace paddle { -namespace operators { -namespace distributed { - -void GRPCClient::InitImpl() { - // start the client process thread - // TODO(wuyi): can make this in a threadpool - client_threads_.resize(FLAGS_rpc_client_threads); - for (int i = 0; i < FLAGS_rpc_client_threads; i++) { - client_threads_[i].reset( - new std::thread(std::bind(&GRPCClient::Proceed, this))); - } -} - -void GRPCClient::SendComplete() { - std::unique_lock lk(completed_mutex_); - if (!completed_) { - for (auto& it : channels_) { - VLOG(3) << "send complete message to " << it.first; - this->AsyncSendComplete(it.first); - } - PADDLE_ENFORCE_EQ(this->Wait(), true, platform::errors::PreconditionNotMet( - "internal grpc service error.")); - completed_ = true; - } -} - -GRPCClient::~GRPCClient() { - stopped_ = true; - Wait(); - cq_.Shutdown(); - { - std::lock_guard guard(chan_mutex_); - for (auto& it : channels_) { - it.second.reset(); - } - channels_.clear(); - } - for (size_t i = 0; i < client_threads_.size(); i++) - client_threads_[i]->join(); -} - -VarHandlePtr GRPCClient::AsyncSendVar(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& var_name, - int64_t time_out) { - const platform::DeviceContext* p_ctx = &ctx; - const std::string ep_val = ep; - const std::string var_name_val = var_name; - const framework::Scope* p_scope = &scope; - const auto ch = GetChannel(ep_val); - const std::string method = kSendRPC; - - int retry_times_ = 0; - - while (true) { - SendProcessor* s = new SendProcessor(ch); - VarHandlePtr h(new VarHandle(ep, method, var_name_val, p_ctx, p_scope)); - s->Prepare(h, time_out); - - framework::Async([var_name_val, p_scope, p_ctx, s, method, h, this] { - auto* var = p_scope->FindVar(var_name_val); - - ::grpc::ByteBuffer req; - SerializeToByteBuffer(var_name_val, var, *p_ctx, &req, "", trainer_id_); - - VLOG(3) << s->GetVarHandlePtr()->String() << " begin"; - - // stub context - s->response_call_back_ = nullptr; - - platform::RecordRPCEvent record_event(method); - - auto call = s->stub_g_.PrepareUnaryCall( - s->context_.get(), "/sendrecv.SendRecvService/SendVariable", req, - &cq_); - call->StartCall(); - call->Finish(&s->reply_, &s->status_, reinterpret_cast(s)); - - if (UNLIKELY(platform::IsProfileEnabled())) { - h->Wait(); - } - }); - req_count_++; - - if (FLAGS_rpc_retry_times > 0 && retry_times_ < FLAGS_rpc_retry_times) { - h->Wait(); - if (h->should_retry) { - VLOG(3) << "rpc call failed, retry times " << retry_times_; - retry_times_++; - std::random_device rd; - std::this_thread::sleep_for(std::chrono::milliseconds(rd() % 5)); - continue; - } - } - - return h; - } -} - -void ProcGetResponse(const VarHandle& var_h, - const ::grpc::ByteBuffer& ret_msg) { - VLOG(4) << "ProcGetResponse"; - framework::Variable* outvar = nullptr; - // get response's trainer_id is not used - int trainer_id; - DeserializeFromByteBuffer(ret_msg, *var_h.ctx(), var_h.scope(), &outvar, - &trainer_id); -} - -void ProcGetRecvResponse(const VarHandle& var_h, - const ::grpc::ByteBuffer& 
ret_msg) { - VLOG(4) << "ProcGetRecvResponse"; - framework::Variable* outvar = nullptr; - int trainer_id; - DeserializeRecvFromByteBuffer(ret_msg, *var_h.ctx(), var_h.scope(), &outvar, - &trainer_id); -} - -template -void RequestToByteBuffer(const T& proto, ::grpc::ByteBuffer* result) { - ::grpc::Slice slice(proto.ByteSizeLong()); - proto.SerializeWithCachedSizesToArray(const_cast(slice.begin())); - ::grpc::ByteBuffer tmp(&slice, 1); - result->Swap(&tmp); -} - -VarHandlePtr GRPCClient::AsyncGetVar(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& var_name, - const std::string& out_varname, - const std::string& table_name, - int64_t time_out) { - return _AsyncGetVar(ep, ctx, scope, kGetRPC, var_name, out_varname, - "/sendrecv.SendRecvService/GetVariable", table_name, - time_out); -} - -VarHandlePtr GRPCClient::AsyncGetVarNoBarrier( - const std::string& ep, const platform::DeviceContext& ctx, - const framework::Scope& scope, const std::string& var_name, - const std::string& out_varname, int64_t time_out) { - std::string var_name_no_barrier = - string::Sprintf("%s%s", var_name, WITHOUT_BARRIER_MESSAGE); - - return _AsyncGetVar( - ep, ctx, scope, kGetNoBarrierRPC, var_name_no_barrier, out_varname, - "/sendrecv.SendRecvService/GetVariableNoBarrier", "", time_out); -} - -VarHandlePtr GRPCClient::AsyncGetMonomerVariable( - const std::string& ep, const platform::DeviceContext& ctx, - const framework::Scope& scope, const std::string& var_name, - int64_t time_out) { - return _AsyncGetVar(ep, ctx, scope, kGetMonomerRPC, var_name, var_name, - "/sendrecv.SendRecvService/GetMonomerVariable", "", - time_out); -} - -VarHandlePtr GRPCClient::_AsyncGetVar( - const std::string& ep, const platform::DeviceContext& ctx, - const framework::Scope& scope, const std::string& method, - const std::string& var_name, const std::string& out_varname, - const std::string& rpc_path, const std::string& table_name, - int64_t time_out) { - const platform::DeviceContext* p_ctx = &ctx; - const std::string ep_val = ep; - const std::string var_name_val = var_name; - const std::string out_varname_val = out_varname; - const std::string table_name_val = table_name; - const framework::Scope* p_scope = &scope; - const auto ch = GetChannel(ep_val); - - int retry_times_ = 0; - - while (true) { - GetProcessor* s = new GetProcessor(ch); - - VarHandlePtr h(new VarHandle(ep, method, out_varname_val, p_ctx, p_scope)); - s->Prepare(h, time_out); - - framework::Async([var_name_val, out_varname_val, table_name_val, s, method, - p_ctx, h, rpc_path, this] { - // prepare input - sendrecv::VariableMessage req; - req.set_varname(var_name_val); - req.set_out_varname(out_varname_val); - req.set_trainer_id(trainer_id_); - req.set_table_name(table_name_val); - ::grpc::ByteBuffer buf; - RequestToByteBuffer(req, &buf); - - VLOG(3) << s->GetVarHandlePtr()->String() << " begin"; - - // stub context - s->response_call_back_ = ProcGetResponse; - - platform::RecordRPCEvent record_event(method); - - auto call = - s->stub_g_.PrepareUnaryCall(s->context_.get(), rpc_path, buf, &cq_); - call->StartCall(); - call->Finish(&s->reply_, &s->status_, reinterpret_cast(s)); - - if (UNLIKELY(platform::IsProfileEnabled())) { - h->Wait(); - } - }); - req_count_++; - - if (FLAGS_rpc_retry_times > 0 && retry_times_ < FLAGS_rpc_retry_times) { - h->Wait(); - if (h->should_retry) { - VLOG(3) << "rpc call failed, retry times " << retry_times_; - retry_times_++; - std::random_device rd; - 
std::this_thread::sleep_for(std::chrono::milliseconds(rd() % 5)); - continue; - } - } - - return h; - } -} - -VarHandlePtr GRPCClient::AsyncPrefetchVar(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& in_var_name, - const std::string& out_var_name, - const std::string& table_name, - int64_t time_out) { - const platform::DeviceContext* p_ctx = &ctx; - const std::string ep_val = ep; - const std::string in_var_name_val = in_var_name; - const std::string out_var_name_val = out_var_name; - const std::string table_name_val = table_name; - const framework::Scope* p_scope = &scope; - const auto ch = GetChannel(ep_val); - - const std::string method = kPrefetchRPC; - int retry_times_ = 0; - - while (true) { - GetProcessor* s = new GetProcessor(ch); - VarHandlePtr h(new VarHandle(ep, method, out_var_name_val, p_ctx, p_scope)); - s->Prepare(h, kPrefetchTimeout); - - auto* var = p_scope->FindVar(in_var_name_val); - - ::grpc::ByteBuffer req; - SerializeToByteBuffer(in_var_name_val, var, *p_ctx, &req, out_var_name_val, - 0, table_name_val); - - VLOG(3) << s->GetVarHandlePtr()->String() << " begin"; - - // stub context - s->response_call_back_ = ProcGetResponse; - - platform::RecordRPCEvent record_event(method); - - auto call = s->stub_g_.PrepareUnaryCall( - s->context_.get(), "/sendrecv.SendRecvService/PrefetchVariable", req, - &cq_); - call->StartCall(); - call->Finish(&s->reply_, &s->status_, static_cast(s)); - - if (UNLIKELY(platform::IsProfileEnabled())) { - h->Wait(); - } - - req_count_++; - - if (FLAGS_rpc_retry_times > 0 && retry_times_ < FLAGS_rpc_retry_times) { - h->Wait(); - if (h->should_retry) { - VLOG(3) << "rpc call failed, retry times " << retry_times_; - retry_times_++; - std::random_device rd; - std::this_thread::sleep_for(std::chrono::milliseconds(rd() % 5)); - continue; - } - } - - return h; - } -} - -VarHandlePtr GRPCClient::AsyncSendBatchBarrier(const std::string& ep, - int64_t time_out) { - const auto ch = GetChannel(ep); - - BatchBarrierProcessor* s = new BatchBarrierProcessor(ch); - const std::string method = kBatchBarrierRPC; - VarHandlePtr h( - new VarHandle(ep, method, BATCH_BARRIER_MESSAGE, nullptr, nullptr)); - s->Prepare(h, time_out); - - sendrecv::VariableMessage req; - req.set_varname(BATCH_BARRIER_MESSAGE); - - platform::RecordRPCEvent record_event(method); - - auto rpc = s->stub_->AsyncSendVariable(s->context_.get(), req, &cq_); - rpc->Finish(&s->reply_, &s->status_, reinterpret_cast(s)); - req_count_++; - - if (UNLIKELY(platform::IsProfileEnabled())) { - h->Wait(); - } - - return h; -} - -VarHandlePtr GRPCClient::AsyncSendFetchBarrier(const std::string& ep, - int64_t time_out) { - const auto ch = GetChannel(ep); - FetchBarrierProcessor* s = new FetchBarrierProcessor(ch); - const std::string method = kFetchBarrierRPC; - VarHandlePtr h( - new VarHandle(ep, method, FETCH_BARRIER_MESSAGE, nullptr, nullptr)); - s->Prepare(h, time_out); - - sendrecv::VariableMessage req; - req.set_varname(FETCH_BARRIER_MESSAGE); - - platform::RecordRPCEvent record_event(method); - - auto rpc = s->stub_->AsyncGetVariable(s->context_.get(), req, &cq_); - rpc->Finish(&s->reply_, &s->status_, reinterpret_cast(s)); - req_count_++; - - if (UNLIKELY(platform::IsProfileEnabled())) { - h->Wait(); - } - - return h; -} - -VarHandlePtr GRPCClient::AsyncGetMonomerBarrier(const std::string& ep, - const std::string& var_name, - int64_t time_out) { - const auto ch = GetChannel(ep); - BatchBarrierProcessor* s = new BatchBarrierProcessor(ch); - 
const std::string method = kSendMonomerFetchBarrierRPC; - VarHandlePtr h(new VarHandle(ep, method, var_name, nullptr, nullptr)); - s->Prepare(h, time_out); - - VLOG(30) << s->GetVarHandlePtr()->String() << " begin"; - - sendrecv::VariableMessage req; - req.set_varname(var_name); - - platform::RecordRPCEvent record_event(method); - - auto rpc = s->stub_->AsyncGetMonomerBarrier(s->context_.get(), req, &cq_); - rpc->Finish(&s->reply_, &s->status_, reinterpret_cast(s)); - req_count_++; - - if (UNLIKELY(platform::IsProfileEnabled())) { - h->Wait(); - } - - return h; -} - -VarHandlePtr GRPCClient::AsyncSendComplete(const std::string& ep, - int64_t time_out) { - const auto ch = GetChannel(ep); - - BatchBarrierProcessor* s = new BatchBarrierProcessor(ch); - const std::string method = kSendCompleteRPC; - VarHandlePtr h(new VarHandle(ep, method, COMPLETE_MESSAGE, nullptr, nullptr)); - s->Prepare(h, time_out); - - sendrecv::VariableMessage req; - req.set_trainer_id(trainer_id_); - req.set_varname(COMPLETE_MESSAGE); - - platform::RecordRPCEvent record_event(method); - - auto rpc = s->stub_->AsyncSendVariable(s->context_.get(), req, &cq_); - rpc->Finish(&s->reply_, &s->status_, reinterpret_cast(s)); - req_count_++; - - if (UNLIKELY(platform::IsProfileEnabled())) { - h->Wait(); - } - - return h; -} - -VarHandlePtr GRPCClient::AsyncCheckpointNotify(const std::string& ep, - const std::string& dirname, - const std::string& varname, - const int mode, - int64_t time_out) { - const auto ch = GetChannel(ep); - - CheckpointNotifyProcessor* s = new CheckpointNotifyProcessor(ch); - - const std::string method = kCheckPointNotifyRPC; - - VarHandlePtr h( - new VarHandle(ep, method, CHECKPOINT_SAVE_MESSAGE, nullptr, nullptr)); - s->Prepare(h, time_out); - - sendrecv::VariableMessage req; - req.set_varname(varname); - req.set_table_name(std::to_string(mode)); - req.set_out_varname(dirname); - - platform::RecordRPCEvent record_event(method); - - auto rpc = s->stub_->AsyncCheckpointNotify(s->context_.get(), req, &cq_); - rpc->Finish(&s->reply_, &s->status_, reinterpret_cast(s)); - req_count_++; - - if (UNLIKELY(platform::IsProfileEnabled())) { - h->Wait(); - } - - return h; -} - -VarHandlePtr GRPCClient::AsyncDistributeNotify( - const std::string& ep, const platform::DeviceContext& ctx, - const framework::Scope& scope, const std::string& var_name, - int64_t time_out) { - const platform::DeviceContext* p_ctx = &ctx; - const std::string ep_val = ep; - const std::string var_name_val = var_name; - const framework::Scope* p_scope = &scope; - const auto ch = GetChannel(ep_val); - const std::string method = kRequestNotify; - - SendProcessor* s = new SendProcessor(ch); - VarHandlePtr h(new VarHandle(ep, method, var_name_val, p_ctx, p_scope)); - s->Prepare(h, time_out); - - framework::Async([var_name_val, p_scope, p_ctx, s, method, h, this] { - auto* var = p_scope->FindVar(var_name_val); - - ::grpc::ByteBuffer req; - SerializeToByteBuffer(var_name_val, var, *p_ctx, &req, "", trainer_id_); - - VLOG(3) << s->GetVarHandlePtr()->String() << " begin"; - - // stub context - s->response_call_back_ = nullptr; - - platform::RecordRPCEvent record_event(method); - - auto call = s->stub_g_.PrepareUnaryCall( - s->context_.get(), "/sendrecv.SendRecvService/DistributeNotify", req, - &cq_); - call->StartCall(); - call->Finish(&s->reply_, &s->status_, reinterpret_cast(s)); - }); - req_count_++; - - if (UNLIKELY(platform::IsProfileEnabled())) { - h->Wait(); - } - - return h; -} - -VarHandlePtr GRPCClient::AsyncSendAndRecv(const std::string& ep, - 
const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& send_var_name, - const std::string& recv_var_name, - const std::string& table_name, - int64_t time_out) { - const platform::DeviceContext* p_ctx = &ctx; - const std::string ep_val = ep; - const std::string send_var_name_val = send_var_name; - const std::string recv_var_name_val = recv_var_name; - const std::string table_name_val = table_name; - const framework::Scope* p_scope = &scope; - const auto ch = GetChannel(ep_val); - const std::string method = kSendAndRecvRPC; - VLOG(4) << "GRPCClient::SendAndRecv Begin ,Send_var_name: " - << send_var_name_val << " Recv_var_name: " << recv_var_name_val; - int retry_times_ = 0; - - while (true) { - SendAndRecvProcessor* s = new SendAndRecvProcessor(ch); - VarHandlePtr h( - new VarHandle(ep, method, send_var_name_val, p_ctx, p_scope)); - VarHandlePtr h_recv( - new VarHandle(ep, method, recv_var_name_val, p_ctx, p_scope)); - s->Prepare(h, time_out); - s->RecvPrepare(h_recv); - - framework::Async([send_var_name_val, recv_var_name_val, table_name_val, - p_scope, p_ctx, s, method, h, this] { - auto* send_var = p_scope->FindVar(send_var_name_val); - send_var->GetMutable()->set_lod({}); - ::grpc::ByteBuffer buf; - VLOG(4) << "SerializeToByteBuffer: send_var_name_val: " - << send_var_name_val - << " recv_var_name_val: " << recv_var_name_val; - SerializeToByteBuffer(send_var_name_val, send_var, *p_ctx, &buf, - recv_var_name_val, trainer_id_, table_name_val); - - VLOG(3) << s->GetVarHandlePtr()->String() << " begin"; - - // stub context - s->response_call_back_ = ProcGetRecvResponse; - - platform::RecordRPCEvent record_event(method); - - auto call = s->stub_g_.PrepareUnaryCall( - s->context_.get(), "/sendrecv.SendRecvService/SendAndRecvVariable", - buf, &cq_); - call->StartCall(); - call->Finish(&s->reply_, &s->status_, reinterpret_cast(s)); - - if (UNLIKELY(platform::IsProfileEnabled())) { - h->Wait(); - } - }); - req_count_++; - - if (FLAGS_rpc_retry_times > 0 && retry_times_ < FLAGS_rpc_retry_times) { - h->Wait(); - if (h->should_retry) { - VLOG(3) << "rpc call failed, retry times " << retry_times_; - retry_times_++; - std::random_device rd; - std::this_thread::sleep_for(std::chrono::milliseconds(rd() % 5)); - continue; - } - } - - return h; - } -} - -bool GRPCClient::Wait() { - std::unique_lock lk(sync_mutex_); - sync_cond_.wait(lk, [this] { return (req_count_ == 0 || ok_ == false); }); - return ok_; -} - -inline bool ShouldRetry(const std::string& method, int error_code) { - if (method == kPrefetchRPC) { - return true; - } - - if (error_code == grpc::StatusCode::DEADLINE_EXCEEDED) { - return true; - } - - return false; -} - -void GRPCClient::Proceed() { - void* tag = nullptr; - bool ok = false; - - VLOG(3) << "GRPCClient Proceed begin"; - while (!stopped_ && cq_.Next(&tag, &ok)) { - BaseProcessor* c = static_cast(tag); - GPR_ASSERT(ok); - PADDLE_ENFORCE_NOT_NULL( - c, platform::errors::PreconditionNotMet("Make BaseProcessor failed.")); - - if (c->status_.ok()) { - VLOG(3) << c->GetVarHandlePtr()->String() << " process"; - c->Process(); - } else if (ShouldRetry(c->GetVarHandlePtr()->method(), - c->status_.error_code())) { - VLOG(0) << c->GetVarHandlePtr()->String() - << " meets grpc error, error_code:" << c->status_.error_code() - << " error_message:" << c->status_.error_message() - << " error_details:" << c->status_.error_details() - << " should retry!"; - c->GetVarHandlePtr()->should_retry = true; - c->Finish(false); - } else { - 
      PADDLE_THROW(platform::errors::External(
          "%s meets grpc error, error_code is %d, error message is %s, error "
          "details is %s.",
          c->GetVarHandlePtr()->String(), c->status_.error_code(),
          c->status_.error_message(), c->status_.error_details()));
      c->Finish(false);
    }

    bool notify = false;
    {
      std::lock_guard<std::mutex> lk(sync_mutex_);
      req_count_--;
      notify = (req_count_ <= 0 || !c->status_.ok());
    }

    delete c;

    if (notify) {
      sync_cond_.notify_all();
    }
  }

  // Last log message
  // Avoid using VLOG() and LOG(): in the destructor of google::LogMessage() a
  // static Mutex log_mutex is used for synchronization, which might have been
  // destructed at this moment.
  if (FLAGS_v >= 3) {
    std::string msg("GRPCClient Proceed end");
    fwrite(msg.c_str(), msg.length(), 1, stderr);
  }
}

std::shared_ptr<grpc::Channel> GRPCClient::GetChannel(const std::string& ep) {
  std::lock_guard<std::mutex> guard(chan_mutex_);
  auto it = channels_.find(ep);
  if (it != channels_.end()) {
    return it->second;
  }

  // Channel configurations:
  grpc::ChannelArguments args;
  args.SetInt(GRPC_ARG_MAX_RECONNECT_BACKOFF_MS, 2000);
  if (FLAGS_rpc_disable_reuse_port) {
    args.SetInt(GRPC_ARG_ALLOW_REUSEPORT, 0);
  }
  args.SetCompressionAlgorithm(GRPC_COMPRESS_NONE);
  args.SetMaxSendMessageSize(std::numeric_limits<int>::max());
  args.SetMaxReceiveMessageSize(std::numeric_limits<int>::max());

  auto ch =
      grpc::CreateCustomChannel(ep, grpc::InsecureChannelCredentials(), args);
  channels_[ep] = ch;
  return ch;
}

}  // namespace distributed
}  // namespace operators
}  // namespace paddle
diff --git a/paddle/fluid/operators/distributed/grpc/grpc_client.h b/paddle/fluid/operators/distributed/grpc/grpc_client.h
deleted file mode 100644
index 5885f944b60a1..0000000000000
--- a/paddle/fluid/operators/distributed/grpc/grpc_client.h
+++ /dev/null
@@ -1,321 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
*/ - -#pragma once - -#include -#include -#include // NOLINT -#include // NOLINT -#include -#include -#include -#include -#include -#include // NOLINT -#include -#include // NOLINT -#include -#include - -#include "grpc++/channel.h" -#include "grpc++/generic/generic_stub.h" -#include "grpc++/grpc++.h" -#include "grpc++/support/byte_buffer.h" -#include "grpc++/support/slice.h" -#include "grpc/support/log.h" -#include "paddle/fluid/framework/blocking_queue.h" -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/framework/selected_rows.h" -#include "paddle/fluid/operators/distributed/distributed_pb.h" -#include "paddle/fluid/operators/distributed/request_handler.h" -#include "paddle/fluid/operators/distributed/rpc_client.h" -#include "paddle/fluid/operators/distributed/sendrecvop_utils.h" -#include "paddle/fluid/platform/macros.h" // for DISABLE_COPY_AND_ASSIGN - -namespace grpc { -class Channel; -} // namespace grpc -namespace paddle { -namespace framework { -class Scope; -} // namespace framework -namespace platform { -class DeviceContext; -} // namespace platform -} // namespace paddle - -namespace paddle { -namespace operators { -namespace distributed { - -void ProcGetResponse(const VarHandle& var_h, const grpc::ByteBuffer& msg); - -void ProcGetRecvResponse(const VarHandle& var_h, const grpc::ByteBuffer& msg); - -class BaseProcessor { - public: - BaseProcessor() { context_ = nullptr; } - - virtual ~BaseProcessor() {} - - virtual void Prepare(VarHandlePtr h, int64_t time_out) { - var_h_ = h; - - context_.reset(new grpc::ClientContext()); - context_->set_wait_for_ready(true); - if (time_out) { - std::chrono::system_clock::time_point deadline = - std::chrono::system_clock::now() + - std::chrono::milliseconds(time_out); - context_->set_deadline(deadline); - } - } - - void Process() { - ProcessImpl(); - var_h_->Finish(true); - } - - VarHandlePtr GetVarHandlePtr() { return var_h_; } - bool Wait() { return var_h_->Wait(); } - void Finish(bool ok) { return var_h_->Finish(ok); } - virtual void ProcessImpl() = 0; - - std::unique_ptr context_; - grpc::Status status_; - - protected: - VarHandlePtr var_h_; -}; - -typedef std::function - RequestSendCallBack; - -class SendProcessor : public BaseProcessor { - public: - explicit SendProcessor(std::shared_ptr ch) - : BaseProcessor(), stub_g_(ch) {} - - virtual ~SendProcessor() {} - - void ProcessImpl() override { - if (response_call_back_) { - response_call_back_(*var_h_.get(), reply_); - } - } - - ::grpc::GenericStub stub_g_; - ::grpc::ByteBuffer reply_; - RequestSendCallBack response_call_back_ = nullptr; -}; - -typedef std::function - RequestGetCallBack; - -class GetProcessor : public BaseProcessor { - public: - explicit GetProcessor(std::shared_ptr ch) - : BaseProcessor(), stub_g_(ch) {} - - virtual ~GetProcessor() {} - - void ProcessImpl() override { - if (response_call_back_) { - response_call_back_(*var_h_.get(), reply_); - } - } - - ::grpc::ByteBuffer reply_; - ::grpc::GenericStub stub_g_; - RequestGetCallBack response_call_back_ = ProcGetResponse; -}; - -class SendAndRecvProcessor : public BaseProcessor { - public: - explicit SendAndRecvProcessor(std::shared_ptr ch) - : BaseProcessor(), stub_g_(ch) {} - - virtual ~SendAndRecvProcessor() {} - - void ProcessImpl() override { - if (response_call_back_) { - response_call_back_(*var_h_recv_.get(), reply_); - var_h_recv_->Finish(true); - } - } - - void RecvPrepare(VarHandlePtr h_recv) { 
var_h_recv_ = h_recv; } - - ::grpc::ByteBuffer reply_; - ::grpc::GenericStub stub_g_; - RequestGetCallBack response_call_back_ = ProcGetResponse; - VarHandlePtr var_h_recv_; -}; - -class BatchBarrierProcessor : public BaseProcessor { - public: - explicit BatchBarrierProcessor(std::shared_ptr ch) - : BaseProcessor() { - stub_ = sendrecv::SendRecvService::NewStub(ch); - } - - virtual ~BatchBarrierProcessor() {} - - void ProcessImpl() override {} - sendrecv::VoidMessage reply_; - std::unique_ptr stub_; -}; - -class FetchBarrierProcessor : public BaseProcessor { - public: - explicit FetchBarrierProcessor(std::shared_ptr ch) - : BaseProcessor() { - stub_ = sendrecv::SendRecvService::NewStub(ch); - } - - virtual ~FetchBarrierProcessor() {} - - void ProcessImpl() override {} - sendrecv::VariableMessage reply_; - std::unique_ptr stub_; -}; - -class CheckpointNotifyProcessor : public BaseProcessor { - public: - explicit CheckpointNotifyProcessor(std::shared_ptr ch) - : BaseProcessor() { - stub_ = sendrecv::SendRecvService::NewStub(ch); - } - - virtual ~CheckpointNotifyProcessor() {} - - void ProcessImpl() override {} - sendrecv::VoidMessage reply_; - std::unique_ptr stub_; -}; - -class GRPCClient : public RPCClient { - public: - GRPCClient() : ok_(true), completed_(false), stopped_(false) {} - virtual ~GRPCClient(); - - VarHandlePtr AsyncSendVar(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& var_name, - int64_t time_out = FLAGS_rpc_deadline) override; - - VarHandlePtr AsyncGetVar(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& var_name, - const std::string& out_varname, - const std::string& table_name = "", - int64_t time_out = FLAGS_rpc_deadline) override; - - VarHandlePtr AsyncGetVarNoBarrier( - const std::string& ep, const platform::DeviceContext& ctx, - const framework::Scope& scope, const std::string& var_name, - const std::string& out_varname, - int64_t time_out = FLAGS_rpc_deadline) override; - - VarHandlePtr AsyncGetMonomerVariable( - const std::string& ep, const platform::DeviceContext& ctx, - const framework::Scope& scope, const std::string& var_name, - int64_t time_out = FLAGS_rpc_deadline) override; - - VarHandlePtr AsyncPrefetchVar(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& in_var_name, - const std::string& out_var_name, - const std::string& table_name = "", - int64_t time_out = FLAGS_rpc_deadline) override; - - VarHandlePtr AsyncSendBatchBarrier( - const std::string& ep, int64_t time_out = FLAGS_rpc_deadline) override; - - VarHandlePtr AsyncSendFetchBarrier(const std::string& ep, - int64_t time_out) override; - - VarHandlePtr AsyncGetMonomerBarrier( - const std::string& ep, const std::string& var_name, - int64_t time_out = FLAGS_rpc_deadline) override; - - VarHandlePtr AsyncCheckpointNotify( - const std::string& ep, const std::string& dirname, - const std::string& varname, const int mode, - int64_t time_out = FLAGS_rpc_deadline) override; - - VarHandlePtr AsyncDistributeNotify( - const std::string& ep, const platform::DeviceContext& ctx, - const framework::Scope& scope, const std::string& var_name, - int64_t time_out = FLAGS_rpc_deadline) override; - - VarHandlePtr AsyncSendAndRecv(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& send_var_name, - const std::string& recv_var_name, - const std::string& table_name = "", 
- int64_t time_out = FLAGS_rpc_deadline) override; - - VarHandlePtr AsyncSendComplete( - const std::string& ep, int64_t time_out = FLAGS_rpc_deadline) override; - - bool Wait() override; - - void SendComplete() override; - - void InitImpl() override; - - private: - void Proceed(); - - std::shared_ptr GetChannel(const std::string& ep); - VarHandlePtr _AsyncGetVar( - const std::string& ep, const platform::DeviceContext& ctx, - const framework::Scope& scope, const std::string& method, - const std::string& var_name, const std::string& out_varname, - const std::string& rpc_path, const std::string& table_name = "", - int64_t time_out = FLAGS_rpc_deadline); - - private: - grpc::CompletionQueue cq_; - std::unordered_map> channels_; - std::vector> client_threads_; - - // mutex for Wait client sync - std::mutex sync_mutex_; - std::condition_variable sync_cond_; - std::atomic req_count_{0}; - bool ok_; - - // mutex for GetChannel thread safety - std::mutex chan_mutex_; - DISABLE_COPY_AND_ASSIGN(GRPCClient); - - // mutex for sending complete message only once - std::mutex completed_mutex_; - bool completed_; - - volatile bool stopped_; -}; - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/grpc/grpc_serde.cc b/paddle/fluid/operators/distributed/grpc/grpc_serde.cc deleted file mode 100644 index 0fc9b69577914..0000000000000 --- a/paddle/fluid/operators/distributed/grpc/grpc_serde.cc +++ /dev/null @@ -1,190 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef PADDLE_WITH_NCCL -#include -#endif -#ifdef PADDLE_WITH_RCCL -#include -#endif -#include -#include -#include "grpcpp/impl/codegen/byte_buffer.h" -#include "grpcpp/impl/codegen/slice.h" -#include "paddle/fluid/operators/distributed/grpc/grpc_serde.h" -#include "paddle/fluid/operators/distributed/grpc/grpc_variable_response.h" -#include "paddle/fluid/operators/distributed/proto_encoder_helper.h" -#include "paddle/fluid/operators/distributed/send_recv.pb.h" -#include "paddle/fluid/operators/distributed/sendrecvop_utils.h" -#include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/profiler.h" - -namespace paddle { -namespace framework { -class Scope; -class Variable; -} // namespace framework -namespace platform { -class DeviceContext; -} // namespace platform -} // namespace paddle - -namespace paddle { -namespace operators { -namespace distributed { - -void SerializeToByteBuffer(const std::string& name, framework::Variable* var, - const platform::DeviceContext& ctx, - ::grpc::ByteBuffer* msg, const std::string& out_name, - const int trainer_id, - const std::string& table_name) { - platform::RecordRPCEvent record_event("serial"); - VarMsg request; - TensorPayload* payload = nullptr; - - request.set_varname(name); - request.set_trainer_id(trainer_id); - // Note: normally the profiler is enabled in 1 trainer, hence only - // 1 trainer returns true for ShouldSendProfileState(). 
It tells PS - // servers the trainer's profiling state so that PS can follow the - // trainer. - if (platform::ShouldSendProfileState()) { - if (platform::IsProfileEnabled()) { - request.set_profile(platform::kEnableProfiler); - } else { - request.set_profile(platform::kDisableProfiler); - } - } - if (!out_name.empty()) { - request.set_out_varname(out_name); - } - if (!table_name.empty()) { - request.set_table_name(table_name); - } - if (var->IsType()) { - request.set_type(::sendrecv::LOD_TENSOR); - payload = new TensorPayload(GetTensorPayload(var, ctx, &request)); - } else if (var->IsType()) { - request.set_type(::sendrecv::SELECTED_ROWS); - payload = new TensorPayload(GetSelectedRowsPayload(var, ctx, &request)); -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) - } else if (var->IsType()) { - request.set_type(::sendrecv::NCCL_ID); -#endif - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "Serialize does not support type: %s", typeid(var->Type()).name())); - } - std::string header; - request.AppendToString(&header); - auto buffer = std::unique_ptr(new char[1024]); - void* buf = buffer.get(); - ProtoEncodeHelper e(static_cast(buf), 1024); - e.WriteRawBytes(std::string(header.data(), header.size())); -// NCCLID is copied directly to the message, return bytebuffer -// with only one slice if serializing NCCLID. -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) - if (var->IsType()) { - e.WriteVarlengthBeginning(VarMsg::kSerializedFieldNumber, - NCCL_UNIQUE_ID_BYTES); - const ncclUniqueId& uid = var->Get(); - e.WriteRawBytes(std::string(uid.internal, NCCL_UNIQUE_ID_BYTES)); - - // for serialize NCCL_ID - ::grpc::Slice slices(e.size()); - memcpy(const_cast(slices.begin()), e.data(), e.size()); - ::grpc::ByteBuffer tmp(&slices, 1); - msg->Swap(&tmp); - return; - } -#endif - PADDLE_ENFORCE_NOT_NULL( - payload, - platform::errors::InvalidArgument( - "Not support type: %s, need to be LOD_TENSOR or SELECTED_ROWS", - var->Type())); - e.WriteVarlengthBeginning(VarMsg::kSerializedFieldNumber, - payload->memory_size()); - if (payload->memory_size() >= std::numeric_limits::max()) { - PADDLE_THROW(platform::errors::InvalidArgument( - "Variable %s length %d should less than %d.", name, - payload->memory_size(), std::numeric_limits::max())); - } - // steal reference of tensor data - ::grpc::Slice slices[4]; // metadata, tensor, rows meta, rows - int num_slices = 2; // only SelectedRows have rows buffer - slices[0] = ::grpc::Slice(e.size()); - memcpy(const_cast(slices[0].begin()), e.data(), e.size()); - slices[1] = ::grpc::Slice( - grpc_slice_new_with_user_data(payload->ptr(), payload->memory_size(), - SerializeDestroyCallback, payload), - ::grpc::Slice::STEAL_REF); - - if (var->IsType()) { - auto* slr = var->GetMutable(); - ProtoEncodeHelper e2(static_cast(buf), 128); - - PADDLE_ENFORCE_EQ(VectorElemName(slr->rows()), typeid(int64_t).name(), - platform::errors::InvalidArgument( - "Got wrong type %s, expect type: int64_t", - VectorElemName(slr->rows()))); - size_t rows_memory_size = slr->rows().size() * sizeof(int64_t); - - e2.WriteVarlengthBeginning(VarMsg::kRowsFieldNumber, rows_memory_size); - slices[2] = ::grpc::Slice(e2.size()); - memcpy(const_cast(slices[2].begin()), e2.data(), e2.size()); - - slices[3] = ::grpc::Slice( - grpc_slice_new_with_user_data( - const_cast( - reinterpret_cast(slr->rows().data())), - rows_memory_size, [](void* backing) {}, - const_cast( - reinterpret_cast(slr->rows().data()))), - ::grpc::Slice::STEAL_REF); - num_slices = 4; - } - 
-  ::grpc::ByteBuffer tmp(&slices[0], num_slices);
-  msg->Swap(&tmp);
-}
-
-void DeserializeFromByteBuffer(const ::grpc::ByteBuffer& msg,
-                               const platform::DeviceContext& ctx,
-                               const framework::Scope* scope,
-                               framework::Variable** var, int* trainer_id) {
-  platform::RecordRPCEvent record_event("deserial");
-  operators::distributed::GRPCVariableResponse resp(scope, &ctx);
-  PADDLE_ENFORCE_EQ(
-      resp.Parse(msg), 0,
-      platform::errors::InvalidArgument("parse bytebuffer to tensor error!"));
-  *var = resp.GetVar();
-  *trainer_id = resp.GetTrainerId();
-}
-
-void DeserializeRecvFromByteBuffer(const ::grpc::ByteBuffer& msg,
-                                   const platform::DeviceContext& ctx,
-                                   const framework::Scope* scope,
-                                   framework::Variable** var, int* trainer_id) {
-  platform::RecordRPCEvent record_event("deserial");
-  operators::distributed::GRPCVariableResponse resp(scope, &ctx);
-  PADDLE_ENFORCE_EQ(
-      resp.Parse(msg), 0,
-      platform::errors::InvalidArgument("parse bytebuffer to tensor error!"));
-  *var = resp.GetRecvVar();
-  *trainer_id = resp.GetTrainerId();
-}
-
-}  // namespace distributed
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/distributed/grpc/grpc_serde.h b/paddle/fluid/operators/distributed/grpc/grpc_serde.h
deleted file mode 100644
index 932f3e2f069a2..0000000000000
--- a/paddle/fluid/operators/distributed/grpc/grpc_serde.h
+++ /dev/null
@@ -1,69 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include
-#include
-#include
-
-#include "paddle/fluid/framework/data_type.h"
-#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/framework/scope.h"
-#include "paddle/fluid/framework/selected_rows.h"
-#include "paddle/fluid/framework/tensor_util.h"
-#include "paddle/fluid/framework/var_type.h"
-#include "paddle/fluid/operators/distributed/distributed_pb.h"
-#include "paddle/fluid/operators/distributed/sendrecvop_utils.h"
-#include "paddle/fluid/platform/port.h"
-
-namespace grpc {
-class ByteBuffer;
-}  // namespace grpc
-namespace paddle {
-namespace framework {
-class Scope;
-class Variable;
-}  // namespace framework
-namespace platform {
-class DeviceContext;
-}  // namespace platform
-}  // namespace paddle
-
-namespace paddle {
-namespace operators {
-namespace distributed {
-
-typedef void (*DestroyCallback)(void*);
-
-void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
-                           const platform::DeviceContext& ctx,
-                           ::grpc::ByteBuffer* msg,
-                           const std::string& out_varname = std::string(),
-                           const int trainer_id = 0,
-                           const std::string& table_name = std::string());
-
-void DeserializeFromByteBuffer(const ::grpc::ByteBuffer& msg,
-                               const platform::DeviceContext& ctx,
-                               const framework::Scope* scope,
-                               framework::Variable** var, int* trainer_id);
-
-void DeserializeRecvFromByteBuffer(const ::grpc::ByteBuffer& msg,
-                                   const platform::DeviceContext& ctx,
-                                   const framework::Scope* scope,
-                                   framework::Variable** var, int* trainer_id);
-
-}  // namespace distributed
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/distributed/grpc/grpc_serde_test.cc b/paddle/fluid/operators/distributed/grpc/grpc_serde_test.cc
deleted file mode 100644
index d407a72938a74..0000000000000
--- a/paddle/fluid/operators/distributed/grpc/grpc_serde_test.cc
+++ /dev/null
@@ -1,224 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
*/ - -#include -#include -#include // NOLINT - -#include "google/protobuf/text_format.h" -#include "gtest/gtest.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/framework/variable.h" -#include "paddle/fluid/operators/distributed/distributed.h" -#include "paddle/fluid/operators/distributed/grpc/grpc_serde.h" -#include "paddle/fluid/operators/distributed/grpc/grpc_variable_response.h" -#include "paddle/fluid/operators/distributed/sendrecvop_utils.h" -#include "paddle/fluid/operators/math/math_function.h" -#include "paddle/fluid/platform/place.h" -#include "paddle/fluid/string/printf.h" - -namespace framework = paddle::framework; -namespace platform = paddle::platform; -namespace operators = paddle::operators; -namespace math = paddle::operators::math; -namespace memory = paddle::memory; - -void RunSerdeTestSelectedRows(platform::Place place) { - platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); - auto& ctx = *pool.Get(place); - - // serialize var to ByteBuffer - framework::Variable var; - auto* slr = var.GetMutable(); - slr->set_height(1000); - auto* tensor = slr->mutable_value(); - auto* rows = slr->mutable_rows(); - tensor->Resize(framework::make_ddim({564, 128})); - tensor->mutable_data(place); - int tensor_numel = 564 * 128; - math::set_constant(ctx, tensor, 32.7); - for (int i = 0; i < 564; ++i) rows->push_back(i); - - ::grpc::ByteBuffer msg; - operators::distributed::SerializeToByteBuffer("myvar", &var, ctx, &msg); - EXPECT_GT(msg.Length(), static_cast(0)); - - // deserialize - std::vector<::grpc::Slice> slices; - (void)msg.Dump(&slices); - std::string tmp; - for (const auto& s : slices) { - tmp.append(reinterpret_cast(s.begin()), s.size()); - } - - sendrecv::VariableMessage varmsg; - EXPECT_TRUE(varmsg.ParseFromString(tmp)); - - // deserialize bytebuffer - EXPECT_EQ(varmsg.varname(), "myvar"); - EXPECT_EQ(varmsg.type(), 1); - - const float* tensor_data = - reinterpret_cast(varmsg.serialized().data()); - const int64_t* rows_data = - reinterpret_cast(varmsg.rows().data()); - for (int i = 0; i < tensor_numel; ++i) { - EXPECT_FLOAT_EQ(tensor_data[i], 32.7); - } - for (int i = 0; i < 564; ++i) { - EXPECT_EQ(rows_data[i], i); - } - - // deserialize zero-copy - // framework::Variable var2; - // operators::distributed::DeserializeFromByteBuffer(msg, ctx, &var2); - framework::Scope scope; - scope.Var("myvar"); - operators::distributed::GRPCVariableResponse resp(&scope, &ctx); - EXPECT_EQ(resp.Parse(msg), 0); - - framework::Variable* var2 = resp.GetVar(); - - auto* slr2 = var2->GetMutable(); - auto* tensor2 = slr2->mutable_value(); - auto* rows2 = slr2->mutable_rows(); - float* tensor_data2 = nullptr; - framework::Tensor tmp_tensor; - - if (platform::is_gpu_place(ctx.GetPlace())) { - platform::CPUPlace cpu; - framework::TensorCopy(*tensor2, cpu, &tmp_tensor); - tensor_data2 = tmp_tensor.data(); - } else { - tensor_data2 = const_cast(tensor2->data()); - } - const int64_t* rows_data2 = rows2->data(); - - for (int i = 0; i < tensor_numel; ++i) { - EXPECT_FLOAT_EQ(tensor_data2[i], 32.7); - } - for (size_t i = 0; i < rows2->size(); ++i) { - EXPECT_EQ(rows_data2[i], static_cast(i)); - } - EXPECT_EQ(slr2->height(), 1000); -} - -void RunTestLodTensor(platform::Place place, int from_type = 0) { - // serialize var to ByteBuffer - framework::Variable var; - auto* tensor = var.GetMutable(); - tensor->Resize(framework::make_ddim({512, 8, 4, 2})); - framework::LoD lod; - lod.push_back(framework::Vector({1, 
3, 8})); - tensor->set_lod(lod); - int tensor_numel = 512 * 8 * 4 * 2; - platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); - auto& ctx = *pool.Get(place); - tensor->mutable_data(place); - math::set_constant(ctx, tensor, 31.9); - - ::grpc::ByteBuffer msg; - operators::distributed::SerializeToByteBuffer("myvar", &var, ctx, &msg, - "outvar", 0, "table_name"); - EXPECT_GT(msg.Length(), static_cast(0)); - - // deserialize - std::vector<::grpc::Slice> slices; - (void)msg.Dump(&slices); - std::string tmp; - for (const auto& s : slices) { - tmp.append(reinterpret_cast(s.begin()), s.size()); - } - sendrecv::VariableMessage varmsg; - EXPECT_TRUE(varmsg.ParseFromString(tmp)); - EXPECT_EQ(varmsg.varname(), "myvar"); - EXPECT_EQ(varmsg.type(), 0); - EXPECT_EQ(varmsg.dims()[0], 512); - EXPECT_EQ(varmsg.dims()[1], 8); - EXPECT_EQ(varmsg.dims()[2], 4); - EXPECT_EQ(varmsg.dims()[3], 2); - EXPECT_EQ(varmsg.lod_level(), 1); - EXPECT_EQ(varmsg.lod(0).lod_data(0), 1); - EXPECT_EQ(varmsg.lod(0).lod_data(1), 3); - EXPECT_EQ(varmsg.lod(0).lod_data(2), 8); - - const float* tensor_data = - reinterpret_cast(varmsg.serialized().data()); - for (int i = 0; i < tensor_numel; ++i) { - EXPECT_FLOAT_EQ(tensor_data[i], 31.9); - } - - // message binary - std::string str; - varmsg.SerializeToString(&str); - - // message bytebuffer - ::grpc::Slice slices_2[1]; - int num_slices = 1; - slices_2[0] = ::grpc::Slice(str.length()); - memcpy(const_cast(slices_2[0].begin()), str.c_str(), str.length()); - ::grpc::ByteBuffer bytebuffer2(&slices_2[0], num_slices); - - // deserialize zero-copy - framework::Scope scope; - scope.Var("myvar"); - operators::distributed::GRPCVariableResponse resp(&scope, &ctx); - if (from_type == 0) { - EXPECT_EQ(resp.Parse(msg), 0); - } else { - EXPECT_EQ(resp.Parse(bytebuffer2), 0); - } - - framework::Variable* var2 = resp.GetVar(); - - auto tensor2 = var2->Get(); - float* tensor_data2 = nullptr; - framework::Tensor tmp_tensor; - - if (platform::is_gpu_place(ctx.GetPlace())) { - platform::CPUPlace cpu; - framework::TensorCopy(tensor2, cpu, &tmp_tensor); - tensor_data2 = tmp_tensor.data(); - } else { - tensor_data2 = const_cast(tensor2.data()); - } - - EXPECT_EQ(varmsg.lod_level(), 1); - EXPECT_EQ(varmsg.lod(0).lod_data(0), 1); - EXPECT_EQ(varmsg.lod(0).lod_data(1), 3); - EXPECT_EQ(varmsg.lod(0).lod_data(2), 8); - for (int i = 0; i < tensor_numel; ++i) EXPECT_FLOAT_EQ(tensor_data2[i], 31.9); -} - -TEST(LodTensor, Run) { - platform::CPUPlace place; - RunTestLodTensor(place); - RunTestLodTensor(place, 1); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - platform::CUDAPlace gpu(0); - RunTestLodTensor(gpu); - RunTestLodTensor(gpu, 1); -#endif -} - -TEST(SelectedRows, Run) { - platform::CPUPlace place; - RunSerdeTestSelectedRows(place); - -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - platform::CUDAPlace gpu; - RunSerdeTestSelectedRows(gpu); -#endif -} diff --git a/paddle/fluid/operators/distributed/grpc/grpc_server.cc b/paddle/fluid/operators/distributed/grpc/grpc_server.cc deleted file mode 100644 index 912520d782d75..0000000000000 --- a/paddle/fluid/operators/distributed/grpc/grpc_server.cc +++ /dev/null @@ -1,720 +0,0 @@ -/*Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include -#include -#include - -#include "paddle/fluid/operators/distributed/grpc/grpc_serde.h" -#include "paddle/fluid/operators/distributed/grpc/grpc_server.h" - -namespace grpc { -class ChannelArguments; -} // namespace grpc -namespace paddle { -namespace framework { -class Variable; -} // namespace framework -namespace operators { -namespace distributed { -class GRPCVariableResponse; -} // namespace distributed -} // namespace operators -} // namespace paddle - -using ::grpc::ServerAsyncResponseWriter; - -DECLARE_bool(rpc_disable_reuse_port); -DECLARE_int32(rpc_retry_bind_port); - -namespace paddle { -namespace operators { -namespace distributed { - -enum CallStatus { PROCESS = 0, FINISH }; - -// reference: -// https://stackoverflow.com/questions/41732884/grpc-multiple-services-in-cpp-async-server -class RequestBase { - public: - explicit RequestBase(GrpcService::AsyncService* service, - ::grpc::ServerCompletionQueue* cq, - RequestHandler* request_handler, int req_id) - : service_(service), - cq_(cq), - status_(PROCESS), - request_handler_(request_handler), - req_id_(req_id) { - PADDLE_ENFORCE_NOT_NULL(cq_, platform::errors::InvalidArgument( - "ServerCompletionQueue cq are empty")); - } - virtual ~RequestBase() {} - virtual void Process() = 0; - - std::string Status2String(const std::string& method) { - std::string status = "Process"; - if (status_ == FINISH) { - status = "Finish"; - } - - std::ostringstream s; - s << method << " name:[" << GetReqName() << "]" - << ", ep:[" << ctx_.peer() << "]" - << " " << status << " using req_id:" << req_id_; - return s.str(); - } - - CallStatus Status() const { - std::lock_guard l(status_mu_); - return status_; - } - - template - void Finish(const T& reply, ServerAsyncResponseWriter* responder) { - std::lock_guard l(status_mu_); - status_ = FINISH; - responder->Finish(reply, ::grpc::Status::OK, - reinterpret_cast(static_cast(req_id_))); - } - virtual std::string GetReqName() = 0; - - protected: - mutable std::mutex status_mu_; - ::grpc::ServerContext ctx_; - GrpcService::AsyncService* service_; - ::grpc::ServerCompletionQueue* cq_; - CallStatus status_; - RequestHandler* request_handler_; - int req_id_; -}; - -class RequestSend final : public RequestBase { - public: - explicit RequestSend(GrpcService::AsyncService* service, - ::grpc::ServerCompletionQueue* cq, - RequestHandler* request_handler, int req_id) - : RequestBase(service, cq, request_handler, req_id), responder_(&ctx_) { - request_.reset(new GRPCVariableResponse(request_handler->scope(), - request_handler->dev_ctx(), true)); - int method_id = static_cast(distributed::GrpcMethod::kSendVariable); - service_->RequestAsyncUnary( - method_id, &ctx_, request_.get(), &responder_, cq_, cq_, - reinterpret_cast(static_cast(req_id))); - } - virtual ~RequestSend() {} - std::string GetReqName() override { return request_->Varname(); } - - void Process() override { - std::string varname = GetReqName(); - - auto scope = request_->GetMutableLocalScope(); - auto invar = request_->GetVar(); - int trainer_id = request_->GetTrainerId(); - - VLOG(4) << "RequestSend var_name:" << varname << " trainer: 
" << trainer_id; - - framework::Variable* outvar = nullptr; - request_handler_->Handle(varname, scope, invar, &outvar, trainer_id); - Finish(reply_, &responder_); - } - - protected: - sendrecv::VoidMessage reply_; - std::shared_ptr request_; - ServerAsyncResponseWriter responder_; -}; - -class RequestGet final : public RequestBase { - public: - explicit RequestGet(GrpcService::AsyncService* service, - ::grpc::ServerCompletionQueue* cq, - RequestHandler* request_handler, int req_id) - : RequestBase(service, cq, request_handler, req_id), responder_(&ctx_) { - auto method_id = static_cast(distributed::GrpcMethod::kGetVariable); - service_->RequestAsyncUnary( - method_id, &ctx_, &request_, &responder_, cq_, cq_, - reinterpret_cast(static_cast(req_id))); - } - - virtual ~RequestGet() {} - - std::string GetReqName() override { return request_.varname(); } - - void Process() override { - // proc request. - std::string varname = request_.varname(); - std::string out_varname = request_.out_varname(); - std::string table_name = request_.table_name(); - int trainer_id = request_.trainer_id(); - - VLOG(4) << "RequestGet " << out_varname << " from " << varname; - - auto scope = request_handler_->scope(); - framework::Variable* invar = nullptr; - framework::Variable* outvar = nullptr; - - tmp_scope_ = std::move(scope->NewTmpScope()); - request_handler_->Handle(varname, tmp_scope_.get(), invar, &outvar, - trainer_id, out_varname, table_name); - - VLOG(1) << "before SerializeToByteBuffer"; - if (outvar) { - SerializeToByteBuffer(out_varname, outvar, *request_handler_->dev_ctx(), - &reply_); - } - VLOG(1) << "after SerializeToByteBuffer"; - Finish(reply_, &responder_); - } - - protected: - sendrecv::VariableMessage request_; - ::grpc::ByteBuffer reply_; - std::unique_ptr tmp_scope_; - ServerAsyncResponseWriter<::grpc::ByteBuffer> responder_; -}; - -class RequestGetNoBarrier final : public RequestBase { - public: - explicit RequestGetNoBarrier(GrpcService::AsyncService* service, - ::grpc::ServerCompletionQueue* cq, - RequestHandler* request_handler, int req_id) - : RequestBase(service, cq, request_handler, req_id), responder_(&ctx_) { - auto method_id = - static_cast(distributed::GrpcMethod::kGetVariableNoBarrier); - service_->RequestAsyncUnary( - method_id, &ctx_, &request_, &responder_, cq_, cq_, - reinterpret_cast(static_cast(req_id))); - } - - virtual ~RequestGetNoBarrier() {} - - std::string GetReqName() override { return request_.varname(); } - - void Process() override { - // proc request. 
- std::string varname = request_.varname(); - std::string out_varname = request_.out_varname(); - int trainer_id = request_.trainer_id(); - - VLOG(4) << "RequestGetNoBarrier " << out_varname << " from " << varname; - - auto scope = request_handler_->scope(); - framework::Variable* invar = nullptr; - framework::Variable* outvar = nullptr; - - request_handler_->Handle(varname, scope, invar, &outvar, trainer_id, - out_varname); - - if (outvar) { - SerializeToByteBuffer(out_varname, outvar, *request_handler_->dev_ctx(), - &reply_); - } - Finish(reply_, &responder_); - } - - protected: - sendrecv::VariableMessage request_; - ::grpc::ByteBuffer reply_; - ServerAsyncResponseWriter<::grpc::ByteBuffer> responder_; -}; - -class RequestGetMonomerVariable final : public RequestBase { - public: - explicit RequestGetMonomerVariable(GrpcService::AsyncService* service, - ::grpc::ServerCompletionQueue* cq, - RequestHandler* request_handler, - int req_id, RPCServer* rpc_server) - : RequestBase(service, cq, request_handler, req_id), - responder_(&ctx_), - rpc_server_(rpc_server) { - auto method_id = - static_cast(distributed::GrpcMethod::kGetMonomerVariable); - service_->RequestAsyncUnary( - method_id, &ctx_, &request_, &responder_, cq_, cq_, - reinterpret_cast(static_cast(req_id))); - } - - virtual ~RequestGetMonomerVariable() {} - - std::string GetReqName() override { return request_.varname(); } - - void Process() override { - // proc request. - std::string varname = request_.varname(); - - rpc_server_->WaitVarCond(varname); - MonomerHandle h = rpc_server_->GetMonomer(varname); - - auto scope = h.scope_; - auto invar = scope->FindVar(varname); - framework::Variable* outvar = nullptr; - - request_handler_->Handle(varname, scope, invar, &outvar, - request_.trainer_id()); - - if (outvar) { - SerializeToByteBuffer(varname, outvar, *h.dev_ctx_, &reply_); - } - Finish(reply_, &responder_); - } - - protected: - sendrecv::VariableMessage request_; - ::grpc::ByteBuffer reply_; - ServerAsyncResponseWriter<::grpc::ByteBuffer> responder_; - RPCServer* rpc_server_{nullptr}; -}; - -class RequestGetMonomerBarrier final : public RequestBase { - public: - explicit RequestGetMonomerBarrier(GrpcService::AsyncService* service, - ::grpc::ServerCompletionQueue* cq, - RequestHandler* request_handler, int req_id, - RPCServer* rpc_server) - : RequestBase(service, cq, request_handler, req_id), - responder_(&ctx_), - rpc_server_(rpc_server) { - auto method_id = - static_cast(distributed::GrpcMethod::kGetMonomerBarrier); - service_->RequestAsyncUnary( - method_id, &ctx_, &request_, &responder_, cq_, cq_, - reinterpret_cast(static_cast(req_id))); - } - - virtual ~RequestGetMonomerBarrier() {} - - std::string GetReqName() override { return request_.varname(); } - - void Process() override { - // proc request. 
- std::string varname = request_.varname(); - VLOG(4) << "RequestGetMonomerBarrier " << varname; - - rpc_server_->WaitVarCond(varname); - MonomerHandle h = rpc_server_->GetMonomer(varname); - - framework::Scope* scope = nullptr; - framework::Variable* invar = nullptr; - framework::Variable* outvar = nullptr; - - request_handler_->Handle(varname, scope, invar, &outvar, - request_.trainer_id()); - - Finish(reply_, &responder_); - } - - protected: - sendrecv::VariableMessage request_; - sendrecv::VoidMessage reply_; - ServerAsyncResponseWriter responder_; - RPCServer* rpc_server_{nullptr}; -}; - -class RequestPrefetch final : public RequestBase { - public: - explicit RequestPrefetch(GrpcService::AsyncService* service, - ::grpc::ServerCompletionQueue* cq, - RequestHandler* request_handler, int req_id) - : RequestBase(service, cq, request_handler, req_id), - responder_(&ctx_), - local_scope_(nullptr) { - request_.reset(new GRPCVariableResponse(request_handler->scope(), - request_handler->dev_ctx(), true)); - int method_id = - static_cast(distributed::GrpcMethod::kPrefetchVariable); - service_->RequestAsyncUnary( - method_id, &ctx_, request_.get(), &responder_, cq_, cq_, - reinterpret_cast(static_cast(req_id))); - } - - virtual ~RequestPrefetch() {} - - std::string GetReqName() override { return request_->Varname(); } - - void Process() override { - // prefetch process... - std::string in_var_name = request_->Varname(); - std::string out_var_name = request_->OutVarname(); - std::string table_name = request_->TableName(); - int trainer_id = request_->GetTrainerId(); - - VLOG(4) << "RequestPrefetch, in_var_name: " << in_var_name - << " out_var_name: " << out_var_name << " trainer: " << trainer_id; - - auto scope = request_->GetMutableLocalScope(); - auto invar = scope->FindVar(in_var_name); - // out var must be created in local scope! 
- framework::Variable* outvar = scope->Var(out_var_name); - - request_handler_->Handle(in_var_name, scope, invar, &outvar, trainer_id, - out_var_name, table_name); - - SerializeToByteBuffer(out_var_name, outvar, *request_handler_->dev_ctx(), - &reply_); - Finish(reply_, &responder_); - } - - protected: - std::shared_ptr request_; - ::grpc::ByteBuffer reply_; - ServerAsyncResponseWriter<::grpc::ByteBuffer> responder_; - framework::Scope* local_scope_; -}; - -class RequestCheckpointNotify final : public RequestBase { - public: - explicit RequestCheckpointNotify(GrpcService::AsyncService* service, - ::grpc::ServerCompletionQueue* cq, - RequestHandler* request_handler, int req_id) - : RequestBase(service, cq, request_handler, req_id), responder_(&ctx_) { - request_.reset(new GRPCVariableResponse(request_handler->scope(), - request_handler->dev_ctx())); - int method_id = - static_cast(distributed::GrpcMethod::kCheckpointNotify); - service_->RequestAsyncUnary( - method_id, &ctx_, request_.get(), &responder_, cq_, cq_, - reinterpret_cast(static_cast(req_id))); - } - - virtual ~RequestCheckpointNotify() {} - - std::string GetReqName() override { return request_->Varname(); } - - void Process() override { - auto scope = request_->GetMutableLocalScope(); - - std::string checkpoint_notify = request_->Varname(); - std::string checkpoint_dir = request_->OutVarname(); - int trainer_id = request_->GetTrainerId(); - std::string table_name = request_->TableName(); - - VLOG(4) << "RequestCheckpointNotify notify: " << checkpoint_notify - << ", dir: " << checkpoint_dir; - - request_handler_->Handle(checkpoint_notify, scope, nullptr, nullptr, - trainer_id, checkpoint_dir, table_name); - Finish(reply_, &responder_); - } - - protected: - std::shared_ptr request_; - sendrecv::VoidMessage reply_; - ServerAsyncResponseWriter responder_; -}; - -class RequestNotify final : public RequestBase { - public: - explicit RequestNotify(GrpcService::AsyncService* service, - ::grpc::ServerCompletionQueue* cq, - RequestHandler* request_handler, int req_id) - : RequestBase(service, cq, request_handler, req_id), responder_(&ctx_) { - request_.reset(new GRPCVariableResponse(request_handler->scope(), - request_handler->dev_ctx(), true)); - int method_id = static_cast(distributed::GrpcMethod::kRequestNotify); - service_->RequestAsyncUnary( - method_id, &ctx_, request_.get(), &responder_, cq_, cq_, - reinterpret_cast(static_cast(req_id))); - } - virtual ~RequestNotify() {} - std::string GetReqName() override { return request_->Varname(); } - - void Process() override { - std::string varname = GetReqName(); - VLOG(4) << "RequestNotify var_name:" << varname; - - auto scope = request_->GetMutableLocalScope(); - auto invar = request_->GetVar(); - int trainer_id = request_->GetTrainerId(); - framework::Variable* outvar = nullptr; - request_handler_->Handle(varname, scope, invar, &outvar, trainer_id); - Finish(reply_, &responder_); - } - - protected: - sendrecv::VoidMessage reply_; - std::shared_ptr request_; - ServerAsyncResponseWriter responder_; -}; - -class RequestSendAndRecv final : public RequestBase { - public: - explicit RequestSendAndRecv(GrpcService::AsyncService* service, - ::grpc::ServerCompletionQueue* cq, - RequestHandler* request_handler, int req_id) - : RequestBase(service, cq, request_handler, req_id), responder_(&ctx_) { - request_.reset(new GRPCVariableResponse(request_handler->scope(), - request_handler->dev_ctx(), true)); - - int method_id = - static_cast(distributed::GrpcMethod::kRequestSendAndRecv); - - 
service_->RequestAsyncUnary( - method_id, &ctx_, request_.get(), &responder_, cq_, cq_, - reinterpret_cast(static_cast(req_id))); - } - - virtual ~RequestSendAndRecv() {} - std::string GetReqName() override { return request_->Varname(); } - - void Process() override { - std::string in_var_name = request_->Varname(); - std::string out_var_name = request_->OutVarname(); - std::string table_name = request_->TableName(); - int trainer_id = request_->GetTrainerId(); - - VLOG(4) << "RequestSendAndRecv, in_var_name: " << in_var_name - << " out_var_name: " << out_var_name << " trainer: " << trainer_id; - auto scope = request_->GetMutableLocalScope(); - auto invar = scope->FindVar(in_var_name); - framework::Variable* outvar = nullptr; - request_handler_->Handle(in_var_name, scope, invar, &outvar, trainer_id, - out_var_name, table_name); - SerializeToByteBuffer(out_var_name, outvar, *request_handler_->dev_ctx(), - &reply_); - Finish(reply_, &responder_); - } - - protected: - std::shared_ptr request_; - ::grpc::ByteBuffer reply_; - ServerAsyncResponseWriter<::grpc::ByteBuffer> responder_; -}; - -void AsyncGRPCServer::WaitServerReady() { - VLOG(4) << "AsyncGRPCServer is waiting server ready"; - std::unique_lock lock(this->mutex_ready_); - condition_ready_.wait(lock, [=] { return this->ready_ == 1; }); - VLOG(4) << "AsyncGRPCServer WaitSeverReady"; -} - -// Define an option subclass in order to disable SO_REUSEPORT for the -// server socket. -// Come from: -// https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc -class NoReusePortOption : public ::grpc::ServerBuilderOption { - public: - void UpdateArguments(::grpc::ChannelArguments* args) override { - args->SetInt(GRPC_ARG_ALLOW_REUSEPORT, 0); - } - - void UpdatePlugins(std::vector>* - plugins) override {} -}; - -void AsyncGRPCServer::StartServer() { - for (int i = 0; i < FLAGS_rpc_retry_bind_port; i++) { - ::grpc::ServerBuilder builder; - std::unique_ptr service( - new GrpcService::AsyncService()); - builder.AddListeningPort(bind_address_, ::grpc::InsecureServerCredentials(), - &selected_port_); - - builder.SetMaxSendMessageSize(std::numeric_limits::max()); - builder.SetMaxReceiveMessageSize(std::numeric_limits::max()); - if (FLAGS_rpc_disable_reuse_port) { - builder.SetOption( - std::unique_ptr<::grpc::ServerBuilderOption>(new NoReusePortOption)); - LOG(INFO) << "set FLAGS_rpc_disable_reuse_port"; - } - builder.RegisterService(service.get()); - - for (auto t : rpc_call_map_) { - rpc_cq_[t.first].reset(builder.AddCompletionQueue().release()); - } - - server_ = builder.BuildAndStart(); - if (selected_port_ != 0) { - LOG(INFO) << "Server listening on " << bind_address_ - << " successful, selected port: " << selected_port_; - service_.reset(service.release()); - break; - } - - LOG(WARNING) << "Server listening on " << bind_address_ - << " failed, selected port: " << selected_port_ - << ", retry after 3 seconds!"; - - sleep(3); - } - - PADDLE_ENFORCE_NE( - selected_port_, 0, - platform::errors::Unavailable("can't bind to address:%s", bind_address_)); - - std::function f = - std::bind(&AsyncGRPCServer::TryToRegisterNewOne, this, - std::placeholders::_1, std::placeholders::_2); - - for (auto& t : rpc_call_map_) { - auto& rpc_name = t.first; - auto& cq = rpc_cq_[rpc_name]; - auto threadnum = rpc_thread_num_[rpc_name]; - auto& reqs = rpc_reqs_[rpc_name]; - - reqs.reserve(kRequestBufSize); - - for (int i = 0; i < kRequestBufSize; i++) { - VLOG(6) << "TryToRegisterNewOne on RPC NAME: " << rpc_name << " 
I: " << i; - TryToRegisterNewOne(rpc_name, i); - } - - for (int i = 0; i < threadnum; i++) { - rpc_threads_[rpc_name].emplace_back(new std::thread(std::bind( - &AsyncGRPCServer::HandleRequest, this, cq.get(), rpc_name, f))); - VLOG(4) << t.first << " creates threads!"; - } - } - - { - std::lock_guard lock(this->mutex_ready_); - ready_ = 1; - } - condition_ready_.notify_all(); - - // wait server - server_->Wait(); - - for (auto& t : rpc_threads_) { - auto& threads = t.second; - for (size_t i = 0; i < threads.size(); ++i) { - threads[i]->join(); - VLOG(4) << t.first << " threads ends!"; - } - } -} - -void AsyncGRPCServer::ShutdownQueue() { - for (auto& t : rpc_cq_) { - t.second->Shutdown(); - VLOG(4) << t.first << " queue shutdown!"; - } -} - -void AsyncGRPCServer::ShutDownImpl() { - std::unique_lock lock(cq_mutex_); - is_shut_down_ = true; - ShutdownQueue(); - - VLOG(4) << "server_ shutdown!"; - server_->Shutdown(); -} - -void AsyncGRPCServer::TryToRegisterNewOne(const std::string& rpc_name, - int req_id) { - std::unique_lock lock(cq_mutex_); - if (is_shut_down_) { - VLOG(4) << "shutdown, do not TryToRegisterNewSendOne"; - return; - } - - VLOG(4) << "TryToRegisterNewOne on RPC NAME: " << rpc_name - << " REQ ID: " << req_id; - - auto& reqs = rpc_reqs_[rpc_name]; - auto& handler = rpc_call_map_[rpc_name]; - auto& cq = rpc_cq_[rpc_name]; - - RequestBase* b = nullptr; - if (rpc_name == kRequestSend) { - b = new RequestSend(service_.get(), cq.get(), handler, req_id); - } else if (rpc_name == kRequestGet) { - b = new RequestGet(service_.get(), cq.get(), handler, req_id); - - } else if (rpc_name == kRequestGetNoBarrier) { - b = new RequestGetNoBarrier(service_.get(), cq.get(), handler, req_id); - } else if (rpc_name == kRequestGetMonomerVariable) { - b = new RequestGetMonomerVariable(service_.get(), cq.get(), handler, req_id, - this); - } else if (rpc_name == kRequestGetMonomerBarrier) { - b = new RequestGetMonomerBarrier(service_.get(), cq.get(), handler, req_id, - this); - } else if (rpc_name == kRequestPrefetch) { - b = new RequestPrefetch(service_.get(), cq.get(), handler, req_id); - } else if (rpc_name == kRequestCheckpoint) { - b = new RequestCheckpointNotify(service_.get(), cq.get(), handler, req_id); - } else if (rpc_name == kRequestNotify) { - b = new RequestNotify(service_.get(), cq.get(), handler, req_id); - } else if (rpc_name == kRequestSendAndRecv) { - b = new RequestSendAndRecv(service_.get(), cq.get(), handler, req_id); - } else { - PADDLE_THROW( - platform::errors::InvalidArgument("not supported rpc: %s", rpc_name)); - } - - reqs[req_id] = b; - - VLOG(4) << "TryToRegisterNewOne status:" << b->Status(); -} - -void AsyncGRPCServer::HandleRequest( - ::grpc::ServerCompletionQueue* cq, const std::string& rpc_name, - std::function TryToRegisterNewOne) { - void* tag = NULL; - bool ok = false; - - while (true) { - VLOG(4) << "HandleRequest " << rpc_name << " wait next"; - if (!cq->Next(&tag, &ok)) { - VLOG(4) << "CompletionQueue " << rpc_name << " shutdown!"; - break; - } - - int req_id = static_cast(reinterpret_cast(tag)); - VLOG(4) << "HandleRequest " << rpc_name << ", req_id:" << req_id - << " get next"; - - auto& reqs = rpc_reqs_[rpc_name]; - RequestBase* base = nullptr; - { - PADDLE_ENFORCE_EQ( - (req_id >= 0 && req_id < kRequestBufSize), true, - platform::errors::OutOfRange("request id: %s out of bounds: [0, %s)", - req_id, kRequestBufSize)); - std::unique_lock lock(cq_mutex_); - base = reqs[req_id]; - } - - VLOG(3) << base->Status2String(rpc_name); - - // reference: - // 
https://github.com/tensorflow/tensorflow/issues/5596 - // https://groups.google.com/forum/#!topic/grpc-io/xftlRy-IQwM - // https://groups.google.com/forum/#!topic/grpc-io/ywATt88Ef_I - if (!ok) { - VLOG(4) << "completion queue:" << rpc_name << " recv no regular event" - << " context:" << base->Status2String(rpc_name); - TryToRegisterNewOne(rpc_name, req_id); - delete base; - continue; - } - - switch (base->Status()) { - case PROCESS: { - base->Process(); - break; - } - case FINISH: { - TryToRegisterNewOne(rpc_name, req_id); - delete base; - break; - } - default: { assert(false); } - } - } -} - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/grpc/grpc_server.h b/paddle/fluid/operators/distributed/grpc/grpc_server.h deleted file mode 100644 index 3d68b7e8cebb4..0000000000000 --- a/paddle/fluid/operators/distributed/grpc/grpc_server.h +++ /dev/null @@ -1,93 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include -#include -#include -#include // NOLINT -#include -#include - -#include "grpc++/grpc++.h" -#include "paddle/fluid/framework/blocking_queue.h" -#include "paddle/fluid/framework/executor.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/framework/selected_rows.h" -#include "paddle/fluid/framework/var_type.h" -#include "paddle/fluid/operators/distributed/distributed_pb.h" -#include "paddle/fluid/operators/distributed/grpc/grpc_service.h" -#include "paddle/fluid/operators/distributed/request_handler.h" -#include "paddle/fluid/operators/distributed/rpc_server.h" -#include "paddle/fluid/operators/distributed/sendrecvop_utils.h" -#include "paddle/fluid/platform/profiler.h" - -namespace grpc { -class ServerCompletionQueue; -} // namespace grpc - -namespace paddle { -namespace operators { -namespace distributed { - -class RequestBase; - -class AsyncGRPCServer final : public RPCServer { - public: - explicit AsyncGRPCServer(const std::string& address, int client_num) - : RPCServer(address, client_num), ready_(0) {} - - virtual ~AsyncGRPCServer() {} - void WaitServerReady() override; - void StartServer() override; - - private: - // HandleRequest needs to be thread-safe. 
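The HandleRequest loop above drives gRPC's asynchronous API: each pending call is registered with a small integer slot id smuggled through the opaque completion-queue tag, and when the event fires the id is decoded again to locate (and, after FINISH, replace) the corresponding RequestBase slot. Below is a minimal standalone sketch of just that tag round trip, added here for illustration only; it is not part of this diff and deliberately uses no gRPC API, and all names in it are hypothetical.

```cpp
// Illustrative sketch: round-tripping a small request id through the opaque
// void* tag handed to (and returned by) a completion queue.
#include <cassert>
#include <cstdint>
#include <iostream>

// Encode a slot index as the opaque tag registered with the completion queue.
void* EncodeTag(int req_id) {
  return reinterpret_cast<void*>(static_cast<intptr_t>(req_id));
}

// Decode the tag back into the slot index when the event completes.
int DecodeTag(void* tag) {
  return static_cast<int>(reinterpret_cast<intptr_t>(tag));
}

int main() {
  const int kRequestBufSize = 100;  // mirrors the fixed slot count above
  for (int req_id = 0; req_id < kRequestBufSize; ++req_id) {
    void* tag = EncodeTag(req_id);
    assert(DecodeTag(tag) == req_id);  // the id survives the round trip
  }
  std::cout << "all " << kRequestBufSize << " tags round-tripped\n";
  return 0;
}
```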
- void HandleRequest( - ::grpc::ServerCompletionQueue* cq, const std::string& rpc_name, - std::function TryToRegisterNewOne); - - void TryToRegisterNewOne(const std::string& rpc_name, int req_id); - void ShutdownQueue(); - void ShutDownImpl() override; - - private: - static const int kRequestBufSize = 100; - - std::mutex cq_mutex_; - volatile bool is_shut_down_ = false; - - std::unique_ptr service_; - std::unique_ptr<::grpc::Server> server_; - - // condition of the sub program - std::condition_variable barrier_condition_; - - std::mutex mutex_ready_; - std::condition_variable condition_ready_; - - int ready_; - - std::map> rpc_cq_; - std::map>> rpc_threads_; - std::map> rpc_reqs_; -}; - -}; // namespace distributed -}; // namespace operators -}; // namespace paddle diff --git a/paddle/fluid/operators/distributed/grpc/grpc_service.h b/paddle/fluid/operators/distributed/grpc/grpc_service.h deleted file mode 100644 index 10037c90853de..0000000000000 --- a/paddle/fluid/operators/distributed/grpc/grpc_service.h +++ /dev/null @@ -1,145 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "paddle/fluid/operators/distributed/grpc/grpc_variable_response.h" -#include "paddle/fluid/platform/profiler.h" - -// NOTE: This method was originally created by tensorflow -// (https://github.com/tensorflow/tensorflow/) we borrow this -// method and did some modifications so that we can parse gRPC -// requests without too much copying of the tensor data. - -namespace grpc { -class CompletionQueue; -class Channel; -class RpcService; -class ServerCompletionQueue; -class ServerContext; - -// Support parsing/unparsing of tensorflow::VariableResponse. -// Wire-format is identical to RecvVariableResponse. 
-template <> -class SerializationTraits< - paddle::operators::distributed::GRPCVariableResponse> { - public: - static Status Serialize( - const paddle::operators::distributed::GRPCVariableResponse& msg, - grpc_byte_buffer** bp, bool* own_buffer) { - PADDLE_THROW(paddle::platform::errors::Unimplemented( - "SerializationTraits::Serialize not implemented!")); - return Status(); - } - static Status Deserialize( - grpc_byte_buffer* buffer, - paddle::operators::distributed::GRPCVariableResponse* msg, - int max_message_size = INT_MAX) { - if (buffer == nullptr) { - return Status(StatusCode::INTERNAL, "No payload"); - } - - Status result = g_core_codegen_interface->ok(); - if (result.ok()) { - paddle::operators::distributed::GrpcByteSource source(buffer); - int ret = msg->Parse(&source); - if (ret != 0) { - result = Status(StatusCode::INTERNAL, "VariableResponse parse error"); - } - } - g_core_codegen_interface->grpc_byte_buffer_destroy(buffer); - return result; - } -}; -} // namespace grpc - -namespace paddle { -namespace operators { -namespace distributed { - -enum class GrpcMethod { - kSendVariable, - kGetVariable, - kPrefetchVariable, - kCheckpointNotify, - kGetVariableNoBarrier, - kGetMonomerVariable, - kGetMonomerBarrier, - kRequestNotify, - kRequestSendAndRecv, - // when you add new handler, change kGrpcNumMethods at the same time! -}; - -static const int kGrpcNumMethods = - static_cast(GrpcMethod::kRequestSendAndRecv) + 1; - -inline const char* GrpcMethodName(GrpcMethod id) { - switch (id) { - case GrpcMethod::kSendVariable: - return "/sendrecv.SendRecvService/SendVariable"; - case GrpcMethod::kGetVariable: - return "/sendrecv.SendRecvService/GetVariable"; - case GrpcMethod::kGetVariableNoBarrier: - return "/sendrecv.SendRecvService/GetVariableNoBarrier"; - case GrpcMethod::kGetMonomerVariable: - return "/sendrecv.SendRecvService/GetMonomerVariable"; - case GrpcMethod::kGetMonomerBarrier: - return "/sendrecv.SendRecvService/GetMonomerBarrier"; - case GrpcMethod::kPrefetchVariable: - return "/sendrecv.SendRecvService/PrefetchVariable"; - case GrpcMethod::kCheckpointNotify: - return "/sendrecv.SendRecvService/CheckpointNotify"; - case GrpcMethod::kRequestNotify: - return "/sendrecv.SendRecvService/DistributeNotify"; - case GrpcMethod::kRequestSendAndRecv: - return "/sendrecv.SendRecvService/SendAndRecvVariable"; - } - - // Shouldn't be reached. - PADDLE_THROW(platform::errors::InvalidArgument( - "Invalid id: not found valid method name")); - return nullptr; -} - -class GrpcService final { - public: - class AsyncService : public ::grpc::Service { - public: - AsyncService() { - for (int i = 0; i < kGrpcNumMethods; ++i) { - AddMethod(new ::grpc::internal::RpcServiceMethod( - GrpcMethodName(static_cast(i)), - ::grpc::internal::RpcMethod::NORMAL_RPC, nullptr)); - ::grpc::Service::MarkMethodAsync(i); - } - } - virtual ~AsyncService() {} - - // Make RequestAsyncUnary public for grpc_call.h - using ::grpc::Service::RequestAsyncUnary; - }; -}; - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/grpc/grpc_variable_response.cc b/paddle/fluid/operators/distributed/grpc/grpc_variable_response.cc deleted file mode 100644 index f7679e9fc924d..0000000000000 --- a/paddle/fluid/operators/distributed/grpc/grpc_variable_response.cc +++ /dev/null @@ -1,344 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
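The deleted grpc_service.h above keeps the RPC surface in sync by deriving the method count from the last enumerator and mapping every enum value to a fully qualified "/sendrecv.SendRecvService/..." name. The following is a small standalone sketch of that pattern, added for illustration; it is not part of the diff, and the shortened enum and function names are hypothetical (only the method-name strings come from the original).

```cpp
// Illustrative sketch: enum-driven registration table for async RPC methods.
#include <iostream>
#include <string>

enum class Method { kSend, kGet, kPrefetch };  // add new values before this line

// Number of methods follows from the last enumerator, as kGrpcNumMethods does.
constexpr int kNumMethods = static_cast<int>(Method::kPrefetch) + 1;

std::string MethodName(Method m) {
  switch (m) {
    case Method::kSend:     return "/sendrecv.SendRecvService/SendVariable";
    case Method::kGet:      return "/sendrecv.SendRecvService/GetVariable";
    case Method::kPrefetch: return "/sendrecv.SendRecvService/PrefetchVariable";
  }
  return "";  // unreachable while the switch stays in sync with the enum
}

int main() {
  // A service constructor can loop over the ids exactly like AsyncService does.
  for (int i = 0; i < kNumMethods; ++i) {
    std::cout << i << " -> " << MethodName(static_cast<Method>(i)) << "\n";
  }
  return 0;
}
```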
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include -#include -#include -#include - -#include "google/protobuf/io/coded_stream.h" -#include "paddle/fluid/operators/distributed/grpc/grpc_bytebuffer_stream.h" -#include "paddle/fluid/operators/distributed/grpc/grpc_variable_response.h" -#include "paddle/fluid/operators/distributed/send_recv.pb.h" -#include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/profiler.h" - -namespace google { -namespace protobuf { -namespace io { -class ZeroCopyInputStream; -} // namespace io -} // namespace protobuf -} // namespace google -namespace grpc { -class ByteBuffer; -} // namespace grpc - -namespace paddle { -namespace operators { -namespace distributed { - -enum WireType { - WIRETYPE_VARINT = 0, - WIRETYPE_LENGTH_DELIMITED = 2, -}; - -inline int GetTagFieldNumber(uint32_t tag) { return tag >> 3; } - -inline WireType GetTagWireType(uint32_t tag) { - return static_cast(tag & 0x7); -} - -bool ReadVarintSizeAsInt(::google::protobuf::io::CodedInputStream* input, - int* result) { - uint64_t v; - if (input->ReadVarint64(&v) && v <= static_cast(INT_MAX)) { - *result = static_cast(v); - return true; - } else { - return false; - } -} - -int GRPCVariableResponse::Parse(const ::grpc::ByteBuffer& byte_buffer) { - GrpcByteBufferSource source; - source.Init(byte_buffer); - GrpcByteBufferSourceWrapper r(&source); - - return Parse(&r); -} - -bool ParseLodData(::google::protobuf::io::CodedInputStream* input, - std::vector* lod) { - while (true) { - auto p = input->ReadTagWithCutoff(127); - int tag = GetTagFieldNumber(p.first); - WireType wt = GetTagWireType(p.first); - - if (!p.second) { - return (tag == 0); - } - - switch (tag) { - case sendrecv::VariableMessage_LodData::kLodDataFieldNumber: { - uint64_t v; - if (wt == WIRETYPE_VARINT) { - if (!input->ReadVarint64(&v)) { - return false; - } - lod->push_back(v); - break; - } - - if (wt == WIRETYPE_LENGTH_DELIMITED) { - int num_bytes = 0; - if (!input->ReadVarintSizeAsInt(&num_bytes)) { - return tag; - } - int start_pos = input->CurrentPosition(); - while (input->CurrentPosition() - start_pos < num_bytes) { - uint64_t v; - if (!input->ReadVarint64(&v)) { - return tag; - } - lod->push_back(v); - } - break; - } - - return false; - } - default: { return false; } - } - } - - return true; -} - -int GRPCVariableResponse::Parse(Source* source) { - ::google::protobuf::io::ZeroCopyInputStream* input_stream = - source->contents(); - ::google::protobuf::io::CodedInputStream input(input_stream); - input.SetTotalBytesLimit(INT_MAX, INT_MAX); - - while (true) { - auto p = input.ReadTagWithCutoff(127); - int tag = GetTagFieldNumber(p.first); - WireType wt = GetTagWireType(p.first); - if (!p.second) { - if (tag != 0) { - return -1; - } - return 0; - } - - switch (tag) { - case sendrecv::VariableMessage::kVarnameFieldNumber: { - uint32_t length; - if ((wt != WIRETYPE_LENGTH_DELIMITED) || !input.ReadVarint32(&length)) { - return tag; - } - - std::string temp; - if (!input.ReadString(&temp, length)) { - 
return tag; - } - - meta_.set_varname(temp); - break; - } - case sendrecv::VariableMessage::kTypeFieldNumber: { - uint32_t v; - if ((wt != WIRETYPE_VARINT) || !input.ReadVarint32(&v)) { - return tag; - } - - meta_.set_type(static_cast<::sendrecv::VarType>(v)); - break; - } - case sendrecv::VariableMessage::kDataTypeFieldNumber: { - uint32_t v = 0; - if ((wt != WIRETYPE_VARINT) || !input.ReadVarint32(&v)) { - return tag; - } - - meta_.set_data_type(static_cast<::sendrecv::VariableMessage_Type>(v)); - break; - } - case sendrecv::VariableMessage::kDimsFieldNumber: { - // not packed - if (wt == WIRETYPE_VARINT) { - uint64_t v; - if (!input.ReadVarint64(&v)) { - return tag; - } - meta_.add_dims(v); - break; - } - - // packed - if (wt == WIRETYPE_LENGTH_DELIMITED) { - int num_bytes = 0; - if (!input.ReadVarintSizeAsInt(&num_bytes)) { - return tag; - } - int start_pos = input.CurrentPosition(); - while (input.CurrentPosition() - start_pos < num_bytes) { - uint64_t v; - if (!input.ReadVarint64(&v)) { - return tag; - } - meta_.add_dims(v); - } - break; - } - return tag; - } - case sendrecv::VariableMessage::kLodLevelFieldNumber: { - uint64_t v = 0; - if ((wt != WIRETYPE_VARINT) || !input.ReadVarint64(&v)) { - return tag; - } - meta_.set_lod_level(static_cast(v)); - break; - } - case sendrecv::VariableMessage::kLodFieldNumber: { - int length = 0; - if (wt != WIRETYPE_LENGTH_DELIMITED || - !ReadVarintSizeAsInt(&input, &length)) { - return tag; - } - - std::pair<::google::protobuf::io::CodedInputStream::Limit, int> p = - input.IncrementRecursionDepthAndPushLimit(length); - - std::vector lod_data; - if (p.second < 0 || !ParseLodData(&input, &lod_data)) { - return tag; - } - - if (!input.DecrementRecursionDepthAndPopLimit(p.first)) { - return tag; - } - - if (lod_data.size() == 0) { - break; - } - - auto lod = meta_.add_lod(); - for (uint32_t i = 0; i < lod_data.size(); i++) { - lod->add_lod_data(lod_data[i]); - } - break; - } - case sendrecv::VariableMessage::kSlrHeightFieldNumber: { - uint64_t v = 0; - if ((wt != WIRETYPE_VARINT) || !input.ReadVarint64(&v)) { - return tag; - } - meta_.set_slr_height(static_cast(v)); - break; - } - case sendrecv::VariableMessage::kSerializedFieldNumber: { - int num_bytes = 0; - if (wt != WIRETYPE_LENGTH_DELIMITED || - !ReadVarintSizeAsInt(&input, &num_bytes)) { - return tag; - } - - if (!ProcSerializedField(tag, &input, num_bytes)) { - return tag; - } - - break; - } - case sendrecv::VariableMessage::kRowsFieldNumber: { - PADDLE_ENFORCE((meta_.type() == sendrecv::SELECTED_ROWS || - meta_.type() == sendrecv::LOD_TENSOR) && - meta_.varname() != "", - platform::errors::PreconditionNotMet( - "meta info should be got first!")); - - int num_bytes = 0; - if (wt != WIRETYPE_LENGTH_DELIMITED || - !ReadVarintSizeAsInt(&input, &num_bytes)) { - return tag; - } - - if (!CopySelectRowsData(&input, *dev_ctx_, num_bytes)) { - return tag; - } - break; - } - case sendrecv::VariableMessage::kOutVarnameFieldNumber: { - uint32_t length; - if ((wt != WIRETYPE_LENGTH_DELIMITED) || !input.ReadVarint32(&length)) { - return tag; - } - - std::string temp; - if (!input.ReadString(&temp, length)) { - return tag; - } - - meta_.set_out_varname(temp); - break; - } - case sendrecv::VariableMessage::kProfileFieldNumber: { - uint64_t profiling = 0; - if (!input.ReadVarint64(&profiling)) { - return tag; - } - meta_.set_profile(profiling); - int64_t listener_id = platform::ListenerId(); - if (listener_id <= 0) { - break; - } - if (profiling == platform::kEnableProfiler && - !platform::IsProfileEnabled()) { 
- platform::EnableProfiler(platform::ProfilerState::kCPU); - } else if (profiling == platform::kDisableProfiler && - platform::IsProfileEnabled()) { - platform::DisableProfiler( - platform::EventSortingKey::kDefault, - string::Sprintf("%s_%lld", FLAGS_rpc_server_profile_path, - listener_id)); - } - break; - } - case sendrecv::VariableMessage::kTrainerIdFieldNumber: { - uint64_t trainer_id = 0; - if (!input.ReadVarint64(&trainer_id)) { - return tag; - } - meta_.set_trainer_id(trainer_id); - break; - } - case sendrecv::VariableMessage::kTableNameFieldNumber: { - uint32_t length; - if ((wt != WIRETYPE_LENGTH_DELIMITED) || !input.ReadVarint32(&length)) { - return tag; - } - - std::string temp; - if (!input.ReadString(&temp, length)) { - return tag; - } - - meta_.set_table_name(temp); - break; - } - default: { - // Unknown tag, return unknown error. - return -1; - } - } - } - - return 0; -} - -}; // namespace distributed -}; // namespace operators -}; // namespace paddle diff --git a/paddle/fluid/operators/distributed/grpc/grpc_variable_response.h b/paddle/fluid/operators/distributed/grpc/grpc_variable_response.h deleted file mode 100644 index 4d12b4a4bacd7..0000000000000 --- a/paddle/fluid/operators/distributed/grpc/grpc_variable_response.h +++ /dev/null @@ -1,67 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include - -#include "google/protobuf/io/coded_stream.h" -#include "google/protobuf/io/zero_copy_stream.h" -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/framework/selected_rows.h" -#include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/framework/var_type.h" -#include "paddle/fluid/operators/distributed/distributed_pb.h" -#include "paddle/fluid/operators/distributed/grpc/grpc_bytebuffer_stream.h" -#include "paddle/fluid/operators/distributed/variable_response.h" - -namespace grpc { -class ByteBuffer; -} // namespace grpc -namespace paddle { -namespace framework { -class Scope; -} // namespace framework -namespace platform { -class DeviceContext; -} // namespace platform -} // namespace paddle - -namespace paddle { -namespace operators { -namespace distributed { - -class GRPCVariableResponse : public VariableResponse { - public: - GRPCVariableResponse(const framework::Scope* scope, - const platform::DeviceContext* dev_ctx, - bool create_scope = false) - : VariableResponse(scope, dev_ctx, create_scope) {} - - virtual ~GRPCVariableResponse() {} - - int Parse(Source* source) override; - - // return: - // 0:ok. - // -1: unkown error. - // other: number of error field. 
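The hand-rolled parser in grpc_variable_response.cc above walks the protobuf wire format directly: every field starts with a varint tag encoding `(field_number << 3) | wire_type`, so the parser recovers the field with a shift and the wire type with a mask before deciding how to read the payload. A minimal standalone sketch of that tag decoding follows, added for illustration only; it is not part of the diff.

```cpp
// Illustrative sketch: splitting a protobuf tag into field number and wire type.
#include <cstdint>
#include <iostream>

enum WireType { WIRETYPE_VARINT = 0, WIRETYPE_LENGTH_DELIMITED = 2 };

int GetTagFieldNumber(uint32_t tag) { return tag >> 3; }
WireType GetTagWireType(uint32_t tag) { return static_cast<WireType>(tag & 0x7); }

int main() {
  uint32_t tag_varint = (4 << 3) | WIRETYPE_VARINT;            // field 4, varint
  uint32_t tag_bytes  = (1 << 3) | WIRETYPE_LENGTH_DELIMITED;  // field 1, bytes (0x0A)
  std::cout << GetTagFieldNumber(tag_varint) << " "
            << GetTagWireType(tag_varint) << "\n";  // prints: 4 0
  std::cout << GetTagFieldNumber(tag_bytes) << " "
            << GetTagWireType(tag_bytes) << "\n";   // prints: 1 2
  return 0;
}
```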
- int Parse(const ::grpc::ByteBuffer& byte_buffer); -}; - -}; // namespace distributed -}; // namespace operators -}; // namespace paddle diff --git a/paddle/fluid/operators/distributed/heart_beat_monitor.cc b/paddle/fluid/operators/distributed/heart_beat_monitor.cc deleted file mode 100644 index 9f537f5334898..0000000000000 --- a/paddle/fluid/operators/distributed/heart_beat_monitor.cc +++ /dev/null @@ -1,97 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/operators/distributed/heart_beat_monitor.h" - -#include - -namespace paddle { -namespace operators { -namespace distributed { - -DEFINE_int32(worker_update_interval_secs, 900, - " the longest time interval between the worker update variables"); - -inline int GetCurrentUS() { - // current date/time based on current system - time_t t = std::time(0); - int now = static_cast(t); - return now; -} - -void HeartBeatMonitor::Update(const int worker_id, std::string be_monitored_var, - WorkerStatus status) { - if (status == UNINITED) { - LOG(WARNING) << "HeartBeatMonitor receive UNINITED status can not be used " - "in Update, something error"; - } - - if (!is_chief_) { - return; - } - - if ((be_monitored_var == be_monitored_var_ && status == RUNNING) || - status == COMPLETED) { - auto timestamp = GetCurrentUS(); - UnderMonitoredWorker& worker = worker_status_map_.at(worker_id); - - if (worker.status != COMPLETED) { - worker.status = status; - } - worker.timestamp = timestamp; - return; - } -} - -void HeartBeatMonitor::LostWorkerMonitor() { - VLOG(1) << "worker heartbeat monitor start at No.0 parameter server"; - while (running_) { - for (int id = 0; id < workers_; ++id) { - auto& worker = worker_status_map_.at(id); - - if (worker.status == UNINITED) { - VLOG(4) << "worker " << worker.id << " is under UNINITED"; - continue; - } - if (worker.status == COMPLETED) { - VLOG(4) << "worker " << worker.id << " is under COMPLETED"; - continue; - } - - auto timestamp = GetCurrentUS(); - - VLOG(4) << "worker " << worker.id << " status is " << worker.status - << " timestamp is " << worker.timestamp << " the interval is " - << timestamp - worker.timestamp; - - if (timestamp - worker.timestamp >= FLAGS_worker_update_interval_secs) { - PADDLE_THROW(platform::errors::ExecutionTimeout( - "the latest update of worker %d is %d secs ago, we doubt the " - "the worker is not alive and this may have a bad effect on the " - "fitting result, please check", - worker.id, FLAGS_worker_update_interval_secs)); - } - } - - std::this_thread::sleep_for(std::chrono::milliseconds(10 * 1000)); - } - VLOG(1) << "worker heartbeat monitor stopped, thread exit"; -} - -std::once_flag HeartBeatMonitor::init_flag_; -std::unique_ptr HeartBeatMonitor::monitor_(nullptr); - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/heart_beat_monitor.h b/paddle/fluid/operators/distributed/heart_beat_monitor.h deleted file 
mode 100644 index d96433c318b35..0000000000000 --- a/paddle/fluid/operators/distributed/heart_beat_monitor.h +++ /dev/null @@ -1,127 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include -#include // NOLINT -#include -#include -#include // NOLINT -#include -#include -#include -#include -#include "gflags/gflags.h" - -#include "paddle/fluid/platform/enforce.h" - -namespace paddle { -namespace operators { -namespace distributed { - -enum WorkerStatus { UNINITED = 0, RUNNING, COMPLETED }; - -struct UnderMonitoredWorker { - int id; - WorkerStatus status; - int timestamp; - - UnderMonitoredWorker() {} - - explicit UnderMonitoredWorker(int worker_id) { - this->id = worker_id; - this->status = UNINITED; - this->timestamp = 0; - } -}; - -class HeartBeatMonitor { - public: - explicit HeartBeatMonitor(int workers, bool is_chief, - std::string be_monitored_var) - : workers_(workers), - is_chief_(is_chief), - be_monitored_var_(be_monitored_var), - running_(true) { - PADDLE_ENFORCE_GT(workers, 0, platform::errors::InvalidArgument( - "workers must greater than 0.")); - - for (auto worker_id = 0; worker_id < workers; worker_id++) { - UnderMonitoredWorker worker(worker_id); - worker_status_map_[worker_id] = std::move(worker); - } - - // we define the No.0 pserver is the first parameter server - // only No.0 will check the heartbeat of all trainers - if (is_chief) { - monitor_thread_.reset(new std::thread( - std::bind(&HeartBeatMonitor::LostWorkerMonitor, this))); - } - } - - ~HeartBeatMonitor() { - running_ = false; - if (monitor_thread_) monitor_thread_->join(); - } - - static void Init(int workers, bool is_chief, std::string be_monitored_var) { - std::call_once(init_flag_, &HeartBeatMonitor::InitImpl, workers, is_chief, - be_monitored_var); - } - - static HeartBeatMonitor* GetInstance() { return monitor_.get(); } - - void Stop() { - running_ = false; - if (!monitor_) { - VLOG(0) << "HeartBeatMonitor is not inited, do nothing"; - } else { - if (monitor_thread_) { - monitor_thread_->join(); - monitor_thread_.reset(nullptr); - } - } - } - - void Update(const int worker_id, std::string be_monitored_var, - WorkerStatus status); - - void LostWorkerMonitor(); - - private: - // Init is called by GetInstance. 
- static void InitImpl(int workers, bool is_chief, - std::string be_monitored_var) { - if (monitor_ == nullptr) { - monitor_.reset(new HeartBeatMonitor(workers, is_chief, be_monitored_var)); - } - } - - static std::once_flag init_flag_; - static std::unique_ptr monitor_; - - int workers_; - bool is_chief_; - std::string be_monitored_var_; - std::unordered_map worker_status_map_; - std::unique_ptr monitor_thread_{nullptr}; - std::mutex mutex_; - bool running_ = false; -}; - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/heart_beat_monitor_test.cc b/paddle/fluid/operators/distributed/heart_beat_monitor_test.cc deleted file mode 100644 index 8505023f63a95..0000000000000 --- a/paddle/fluid/operators/distributed/heart_beat_monitor_test.cc +++ /dev/null @@ -1,54 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/operators/distributed/heart_beat_monitor.h" - -#include "gtest/gtest.h" - -namespace paddle { -namespace operators { -namespace distributed { - -void run(HeartBeatMonitor* monitor) { monitor->LostWorkerMonitor(); } - -TEST(HeartBeatMonitor, All) { - int trainers = 10; - int pserver_id = 0; - std::string var = "nce_w@GRAD.block0"; - std::string var2 = "nce_w@GRAD.block2"; - - HeartBeatMonitor::Init(trainers, pserver_id == 0, var); - - auto* monitor = HeartBeatMonitor::GetInstance(); - - std::vector ids{1, 3, 5, 7}; - - for (auto& id : ids) { - monitor->Update(id, var, RUNNING); - } - - monitor->Update(9, var2, RUNNING); - monitor->Update(2, var, COMPLETED); - - std::thread t(run, monitor); - t.detach(); - - std::this_thread::sleep_for(std::chrono::milliseconds(15 * 1000)); - - monitor->Stop(); -} - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/large_scale_kv.h b/paddle/fluid/operators/distributed/large_scale_kv.h deleted file mode 100644 index da2281231fc8a..0000000000000 --- a/paddle/fluid/operators/distributed/large_scale_kv.h +++ /dev/null @@ -1,848 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
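The heartbeat monitor removed above boils down to one rule in LostWorkerMonitor: a non-completed worker whose last Update() timestamp is older than FLAGS_worker_update_interval_secs is treated as lost. Below is a minimal standalone sketch of that liveness check, added for illustration; it is not part of the diff, and apart from the 900-second default the struct and function names are hypothetical.

```cpp
// Illustrative sketch: timestamp-based lost-worker detection.
#include <iostream>
#include <unordered_map>
#include <vector>

struct Worker {
  int last_update;  // seconds since epoch of the last Update() call
  bool completed;   // COMPLETED workers are never reported as lost
};

std::vector<int> FindLostWorkers(const std::unordered_map<int, Worker>& workers,
                                 int now, int interval_secs) {
  std::vector<int> lost;
  for (const auto& kv : workers) {
    if (kv.second.completed) continue;
    if (now - kv.second.last_update >= interval_secs) lost.push_back(kv.first);
  }
  return lost;
}

int main() {
  const int kInterval = 900;  // default of FLAGS_worker_update_interval_secs above
  int now = 10000;
  std::unordered_map<int, Worker> workers = {
      {0, {now - 10, false}},    // recently updated -> alive
      {1, {now - 1200, false}},  // stale -> lost
      {2, {now - 5000, true}},   // completed -> ignored
  };
  for (int id : FindLostWorkers(workers, now, kInterval)) {
    std::cout << "worker " << id << " looks lost\n";  // prints only worker 1
  }
  return 0;
}
```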
- -#pragma once - -#include -#include -#include // NOLINT -#include -#include -#include // NOLINT -#include -#include -#include -#include -#include "gflags/gflags.h" - -#include "paddle/fluid/framework/generator.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/rw_lock.h" -#include "paddle/fluid/framework/selected_rows.h" -#include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/framework/threadpool.h" -#include "paddle/fluid/framework/variable.h" -#include "paddle/fluid/platform/device_context.h" -#include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/place.h" -#include "paddle/fluid/platform/port.h" -#include "paddle/fluid/string/printf.h" -#include "paddle/fluid/string/string_helper.h" - -namespace paddle { -namespace operators { -namespace distributed { - -enum Mode { training, infer }; -enum InitType { uniform_random, fill_constant, gaussian_random }; - -inline std::vector bucket(const int v_size, const int b_size) { - int remainder = v_size % b_size; - int bucket = v_size / b_size; - std::vector ret_vec(b_size, bucket); - for (int i = 0; i < remainder; ++i) { - ret_vec[i] = ret_vec[i] + 1; - } - int cur_bucket = 0; - for (int &j : ret_vec) { - int tmp = j; - j = cur_bucket; - cur_bucket += tmp; - } - ret_vec.push_back(cur_bucket); - return ret_vec; -} - -class Initializer { - public: - Initializer() {} - - explicit Initializer(const std::vector &attrs) {} - - virtual float GetValue() = 0; - - virtual ~Initializer() {} - - protected: - std::string name_; - unsigned int seed_; -}; - -class UniformInitializer : public Initializer { - public: - explicit UniformInitializer(const std::vector &attrs) { - name_ = attrs[0]; - seed_ = static_cast(std::stoi(attrs[1])); - min_ = std::stof(attrs[2]); - max_ = std::stof(attrs[3]); - - dist_ = std::uniform_real_distribution(min_, max_); - random_engine_ = framework::GetCPURandomEngine(seed_); - } - - float GetValue() override { return dist_(*random_engine_); } - - private: - float min_; - float max_; - - std::shared_ptr random_engine_; - std::uniform_real_distribution dist_; -}; - -template -inline bool entry(const int count, const T threshold); - -template <> -inline bool entry(const int count, const std::string threshold) { - return true; -} - -template <> -inline bool entry(const int count, const int threshold) { - return count >= threshold; -} - -template <> -inline bool entry(const int count, const float threshold) { - UniformInitializer uniform = UniformInitializer({"0", "0", "1"}); - return uniform.GetValue() >= threshold; -} - -class GaussianInitializer : public Initializer { - public: - explicit GaussianInitializer(const std::vector &attrs) { - name_ = attrs[0]; - seed_ = static_cast(std::stoi(attrs[1])); - mean_ = std::stof(attrs[2]); - std_ = std::stof(attrs[3]); - - random_engine_ = framework::GetCPURandomEngine(seed_); - - dist_ = std::normal_distribution(mean_, std_); - } - - float GetValue() override { return dist_(*random_engine_); } - - private: - float std_; - float mean_; - - std::shared_ptr random_engine_; - std::normal_distribution dist_; -}; - -class FillConstantInitializer : public Initializer { - public: - explicit FillConstantInitializer(const std::vector &attrs) { - name_ = attrs[0]; - value_ = std::stof(attrs[1]); - } - - float GetValue() override { return value_; } - - private: - float value_; -}; - -struct SparseMeta { - std::string name; - std::string grad_name; - std::vector value_names; - std::vector value_dims; - std::vector cached_varnames; - 
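The bucket() helper defined earlier in large_scale_kv.h above converts a total item count and a bucket count into begin/end offsets (the first `v_size % b_size` buckets get one extra element); callers later slice work as `[buckets[j], buckets[j+1])`. The following self-contained sketch, added for illustration and not part of the diff, reproduces that arithmetic so it can be checked on a concrete case.

```cpp
// Illustrative sketch: turning per-bucket sizes into cumulative begin offsets.
#include <iostream>
#include <vector>

std::vector<int> Bucket(int v_size, int b_size) {
  int remainder = v_size % b_size;
  std::vector<int> sizes(b_size, v_size / b_size);
  for (int i = 0; i < remainder; ++i) sizes[i] += 1;
  // Convert sizes into begin offsets, then append the overall end.
  int cur = 0;
  for (int& s : sizes) { int tmp = s; s = cur; cur += tmp; }
  sizes.push_back(cur);
  return sizes;
}

int main() {
  // 10 ids over 3 buckets -> ranges [0,4), [4,7), [7,10).
  std::vector<int> offsets = Bucket(10, 3);
  for (size_t j = 0; j + 1 < offsets.size(); ++j) {
    std::cout << "bucket " << j << ": [" << offsets[j] << ", "
              << offsets[j + 1] << ")\n";
  }
  return 0;
}
```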
std::vector initializer_attrs; - std::string entry; - Mode mode; - - std::string ToString() { - std::stringstream ss; - ss << "name: " << name << " "; - ss << "mode: " << mode << " "; - - for (int i = 0; i < static_cast(value_names.size()); i++) { - ss << "value_name: " << value_names[i] << " dim: " << value_dims[i] - << " "; - } - - ss << " grad var: " << grad_name; - - ss << " cached varnames: "; - for (int i = 0; i < static_cast(cached_varnames.size()); i++) { - ss << cached_varnames[i] << " "; - } - - ss << " initializer attrs: "; - for (int i = 0; i < static_cast(initializer_attrs.size()); i++) { - ss << initializer_attrs[i] << " "; - } - - ss << " entry attrs: " << entry; - - return ss.str(); - } -}; - -struct VALUE { - explicit VALUE(const std::vector &names) - : names_(names), count_(0), unseen_days_(0) { - values_.resize(names.size()); - for (int i = 0; i < static_cast(names.size()); i++) { - places[names[i]] = i; - } - } - - void set(std::vector> *values) { - values_ = std::move(*values); - } - - void set(const std::vector &names, - const std::vector> &values) { - for (int i = 0; i < static_cast(names.size()); i++) { - auto idx = places[names[i]]; - auto value = values[i]; - values_[idx].assign(value.begin(), value.end()); - } - } - - std::vector *> get() { - auto pts = std::vector *>(); - pts.reserve(values_.size()); - - for (auto &value : values_) { - pts.push_back(&value); - } - return pts; - } - - int fetch_count() { return ++count_; } - void reset_unseen_days() { unseen_days_ = 0; } - - void set_entry(bool is_entry) { is_entry_ = is_entry; } - - bool get_entry() { return is_entry_; } - - std::vector *> get(const std::vector names) { - auto pts = std::vector *>(); - pts.reserve(values_.size()); - - for (int i = 0; i < static_cast(names.size()); i++) { - pts.push_back(&(values_[places[names[i]]])); - } - return pts; - } - - std::vector names_; - int count_; - bool seen_after_last_save_; - int unseen_days_; - bool is_entry_; - std::vector> values_; - std::unordered_map places; -}; - -class ValueBlock { - public: - explicit ValueBlock(const std::vector value_names, - const std::vector value_dims, const Mode &mode, - const std::vector &init_attrs, - const std::string &entry_attr) - : value_names_(value_names), value_dims_(value_dims), mode_(mode) { - // for Initializer - for (size_t i = 0; i < value_names.size(); i++) { - auto name = value_names[i]; - auto slices = string::split_string(init_attrs[i], "&"); - - if (slices[0] == "gaussian_random") { - initializers_[name] = new GaussianInitializer(slices); - } else if (slices[0] == "fill_constant") { - initializers_[name] = new FillConstantInitializer(slices); - } else if (slices[0] == "uniform_random") { - initializers_[name] = new UniformInitializer(slices); - } else { - PADDLE_THROW( - platform::errors::InvalidArgument("%s can not be supported", name)); - } - } - - // for Entry - { - if (entry_attr == "none") { - entry_func_ = - std::bind(entry, std::placeholders::_1, "none"); - } else { - auto slices = string::split_string(entry_attr, "&"); - if (slices[0] == "count_filter") { - int threshold = std::stoi(slices[1]); - entry_func_ = std::bind(entry, std::placeholders::_1, threshold); - } else if (slices[0] == "probability") { - float threshold = std::stof(slices[1]); - entry_func_ = - std::bind(entry, std::placeholders::_1, threshold); - } - } - } - - rwlock_.reset(new framework::RWLock); - } - - ~ValueBlock() { - // for (auto init : initializers_) { - // delete init.second; - // initializers_.erase(init.first); - // } - // - // 
for (auto value : values_) { - // delete value.second; - // values_.erase(value.first); - // } - } - - void Init(const int64_t &id, std::vector> *values, - int count) { - if (Has(id)) { - PADDLE_THROW(platform::errors::AlreadyExists("id already exist, error")); - } - - if (values->size() != value_names_.size()) { - PADDLE_THROW( - platform::errors::AlreadyExists("values can not match, error")); - } - - auto value = new VALUE(value_names_); - value->set(values); - value->seen_after_last_save_ = true; - value->count_ = count; - values_[id] = value; - } - - std::vector *> Get( - const int64_t &id, const std::vector &value_names) { - rwlock_->RDLock(); - auto ret_values = values_.at(id)->get(value_names); - rwlock_->UNLock(); - return ret_values; - } - - void InitFromInitializer(const int64_t &id, - const std::vector &value_names) { - rwlock_->WRLock(); - - if (Has(id)) { - Update(id); - rwlock_->UNLock(); - return; - } - - auto rets = std::vector>(); - rets.resize(value_names_.size()); - - for (int i = 0; i < static_cast(value_names_.size()); i++) { - auto name = value_names_[i]; - auto *init = initializers_.at(name); - - auto dim = value_dims_[i]; - rets[i].resize(dim); - - for (int j = 0; j < static_cast(dim); j++) { - rets[i][j] = init->GetValue(); - } - } - - Init(id, &rets, 0); - Update(id); - rwlock_->UNLock(); - } - - bool GetEntry(const int64_t &id) { - rwlock_->RDLock(); - auto value = values_.at(id); - auto entry = value->get_entry(); - rwlock_->UNLock(); - return entry; - } - - void Set(const int64_t &id, const std::vector &value_names, - const std::vector> &values) { - rwlock_->WRLock(); - auto value = values_.at(id); - value->set(value_names, values); - rwlock_->UNLock(); - } - - void Update(const int64_t id) { - auto *value = values_.at(id); - value->reset_unseen_days(); - auto count = value->fetch_count(); - - if (!value->get_entry()) { - value->set_entry(entry_func_(count)); - } - } - - private: - bool Has(const int64_t id) { - auto got = values_.find(id); - if (got == values_.end()) { - return false; - } else { - return true; - } - } - - public: - std::unordered_map values_; - - private: - std::vector value_names_; - std::vector value_dims_; - Mode mode_; - std::function entry_func_; - std::unordered_map initializers_; - std::unique_ptr rwlock_{nullptr}; -}; - -class SparseVariable { - public: - explicit SparseVariable(const SparseMeta &meta) { - meta_.name = meta.name; - meta_.mode = meta.mode; - meta_.value_names = meta.value_names; - meta_.value_dims = meta.value_dims; - meta_.grad_name = meta.grad_name; - meta_.cached_varnames = meta.cached_varnames; - meta_.initializer_attrs = meta.initializer_attrs; - meta_.entry = meta.entry; - - for (int i = 0; i < static_cast(meta_.value_names.size()); i++) { - values_dims_[meta_.value_names[i]] = meta_.value_dims[i]; - } - - for (size_t i = 0; i < shard_num_; i++) { - auto block = std::make_shared( - meta.value_names, meta.value_dims, meta.mode, meta.initializer_attrs, - meta.entry); - shard_blocks_.emplace_back(block); - } - - rwlock_.reset(new framework::RWLock); - } - - void Init(const std::vector &ids) { - rwlock_->RDLock(); - for (auto &id : ids) { - auto *block = GetShard(id); - block->InitFromInitializer(id, meta_.value_names); - } - rwlock_->UNLock(); - } - - void Get(const std::vector &ids, - const std::vector &value_names, - std::vector *>> *values) { - values->resize(ids.size()); - - auto buckets = bucket(ids.size(), 8); - std::vector> fs; - - for (int j = 0; j < 8; ++j) { - auto begin = buckets[j]; - auto end = buckets[j 
+ 1]; - - fs.push_back( - framework::Async([begin, end, &values, &ids, &value_names, this]() { - for (int x = begin; x < end; x++) { - auto id = ids[x]; - auto *block = GetShard(id); - auto id_values = block->Get(id, value_names); - (*values)[x] = id_values; - } - })); - } - - for (size_t i = 0; i < fs.size(); ++i) fs[i].wait(); - } - - void GetEntry(const std::vector &ids, std::vector *values) { - auto buckets = bucket(ids.size(), 8); - std::vector> fs; - - for (int j = 0; j < 8; ++j) { - auto begin = buckets[j]; - auto end = buckets[j + 1]; - - fs.push_back(framework::Async([begin, end, &values, &ids, this]() { - for (int x = begin; x < end; x++) { - auto id = ids[x]; - auto *block = GetShard(id); - auto is_entry = block->GetEntry(id); - - if (!is_entry) { - values->push_back(id); - } - } - })); - } - for (size_t i = 0; i < fs.size(); ++i) fs[i].wait(); - } - - void Set(const std::vector &ids, - const std::vector &value_names, - const std::vector>> &values) { - for (int i = 0; i < static_cast(ids.size()); i++) { - GetShard(ids[i])->Set(ids[i], value_names, values[i]); - } - } - - void Dims(std::vector value_names, std::vector *dims) { - for (auto &name : value_names) { - dims->push_back(values_dims_.at(name)); - } - } - - std::vector CachedVarnames() const { - return meta_.cached_varnames; - } - - void Load(const std::string &dirname) { - rwlock_->WRLock(); - VLOG(1) << "load " << meta_.name << " from dir: " << dirname << " begin"; - - std::vector filenames; - for (auto &value_name : meta_.value_names) { - auto filename = string::Sprintf("%s/%s", dirname, value_name); - filenames.push_back(filename); - } - - LoadFromSelectedRows(filenames, meta_.value_names); - VLOG(1) << "load " << meta_.name << " in dir: " << dirname << " done"; - rwlock_->UNLock(); - } - - void LoadFromSelectedRows(const std::vector &filenames, - const std::vector &valuenames) { - std::vector> variables; - auto place = platform::CPUPlace(); - - for (int i = 0; i < static_cast(filenames.size()); i++) { - auto var = std::make_shared(); - variables.push_back(var); - auto &filename = filenames[i]; - std::ifstream fin(filename, std::ios::binary); - auto *selectedRows = var->GetMutable(); - - platform::DeviceContextPool &pool = - platform::DeviceContextPool::Instance(); - auto &dev_ctx = *pool.Get(place); - - framework::DeserializeFromStream(fin, selectedRows, dev_ctx); - selectedRows->SyncIndex(); - } - - std::vector tensors; - - for (int i = 0; i < static_cast(filenames.size()); i++) { - auto &slr = variables[i]->Get(); - auto src_t = slr.value(); - const auto *value = src_t.data(); - tensors.push_back(value); - } - - for (int i = 1; i < static_cast(filenames.size()); i++) { - auto rows_0 = variables[0]->Get().rows(); - auto rows_i = variables[i]->Get().rows(); - - bool is_equal = std::equal(rows_0.begin(), rows_0.end(), rows_i.begin()); - - if (!is_equal) { - PADDLE_THROW(platform::errors::InvalidArgument( - "%s and %s are not equal, can not be load rightly", filenames[0], - filenames[i])); - } - } - - auto rows = variables[0]->Get().rows(); - - for (auto i = 0; i < static_cast(rows.size()); i++) { - auto id = rows[i]; - std::vector> values; - values.resize(filenames.size()); - - for (int j = 0; j < static_cast(filenames.size()); ++j) { - values[j].resize(meta_.value_dims[j]); - std::memcpy(values[j].data(), tensors[j] + i * meta_.value_dims[j], - sizeof(float) * meta_.value_dims[j]); - } - - auto *block = GetShard(id); - block->Init(id, &values, 0); - block->Update(id); - } - } - - void Save(const std::string &dirname, 
const int mode = 0) { - rwlock_->WRLock(); - VLOG(3) << "save " << meta_.name << " in dir: " << dirname << " begin"; - - MkDirRecursively(dirname.c_str()); - - std::vector filenames; - for (auto &value_name : meta_.value_names) { - auto filename = string::Sprintf("%s/%s", dirname, value_name); - filenames.push_back(filename); - } - - SaveToSelectedRows(filenames, meta_.value_names, mode); - VLOG(3) << "save " << meta_.name << " in dir: " << dirname << " done"; - rwlock_->UNLock(); - } - - void SaveToSelectedRows(const std::vector &filenames, - const std::vector &valuenames, - const int mode) { - for (auto &value_name : valuenames) { - auto it = std::find(meta_.value_names.begin(), meta_.value_names.end(), - value_name); - if (it == meta_.value_names.end()) { - PADDLE_THROW(platform::errors::InvalidArgument( - "[%s] is invalid param for [%s]", value_name, meta_.name)); - } - } - - auto place = platform::CPUPlace(); - platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); - auto &dev_ctx = *pool.Get(place); - - std::vector ids; - - for (auto &block : shard_blocks_) { - for (auto value : block->values_) { - if (mode == 0) { - ids.push_back(value.first); - } else { - bool id_need_save = false; - // save all params - if (mode == 1) { - id_need_save = true; - } else { - id_need_save = value.second->seen_after_last_save_; - } - - if (id_need_save) { - ids.push_back(value.first); - } - value.second->seen_after_last_save_ = false; - } - } - } - - VLOG(3) << "save " << ids.size() << " feasigns for " << meta_.name - << " with mode: " << mode; - - std::vector> variables; - std::vector tensors; - std::vector dims; - - for (int i = 0; i < static_cast(filenames.size()); i++) { - auto dim = values_dims_.at(valuenames[i]); - auto var = std::make_shared(); - auto *slr = var->GetMutable(); - auto *src_t = slr->mutable_value(); - - src_t->Resize({static_cast(ids.size()), dim}); - auto *value = src_t->mutable_data(place); - - dims.push_back(dim); - variables.push_back(var); - tensors.push_back(value); - } - - std::vector *>> values; - Get(ids, valuenames, &values); - - int64_t offset = 0; - for (auto &vss : values) { - for (int i = 0; i < static_cast(vss.size()); i++) { - auto &vs = vss[i]; - std::memcpy(tensors[i] + offset * dims[i], vs->data(), - sizeof(float) * dims[i]); - } - offset += 1; - } - - for (auto &var : variables) { - auto *slr = var->GetMutable(); - slr->set_rows(ids); - slr->set_height(ids.size()); - } - - for (int i = 0; i < static_cast(filenames.size()); i++) { - auto &filename = filenames[i]; - auto &selectedRows = variables[i]->Get(); - - std::ofstream fout(filename, std::ios::binary); - PADDLE_ENFORCE_EQ(static_cast(fout), true, - platform::errors::Unavailable( - "Cannot open %s to save variables.", filename)); - - framework::SerializeToStream(fout, selectedRows, dev_ctx); - fout.close(); - } - } - - void SaveToText(const std::vector &filenames, - const std::vector &valuenames) { - for (auto &value_name : valuenames) { - auto it = std::find(meta_.value_names.begin(), meta_.value_names.end(), - value_name); - if (it == meta_.value_names.end()) { - PADDLE_THROW(platform::errors::InvalidArgument( - "[%s] is invalid param for [%s]", value_name, meta_.name)); - } - } - - std::vector> fouts; - - for (auto filename : filenames) { - std::unique_ptr fout(new std::ofstream(filename)); - fouts.push_back(std::move(fout)); - } - - for (auto &block : shard_blocks_) { - for (auto value : block->values_) { - std::vector *> vss = value.second->get(valuenames); - - auto id = 
value.first; - - for (int i = 0; i < static_cast(vss.size()); i++) { - auto &vs = vss[i]; - std::stringstream ss; - ss << id << "\t"; - ss << vs->size() << "\t"; - for (auto v : (*vs)) { - ss << v << " "; - } - ss << "\n"; - - fouts[i]->write(ss.str().c_str(), sizeof(char) * ss.str().size()); - } - } - } - - for (int i = 0; i < static_cast(fouts.size()); i++) { - fouts[i]->close(); - } - } - - int64_t Size() { - int64_t cnt = 0; - - for (auto &block : shard_blocks_) { - cnt += block->values_.size(); - } - return cnt; - } - - ValueBlock *GetShard(const int64_t id) { - return shard_blocks_[id & shard_mask_].get(); - } - - SparseMeta *GetMeta() { return &meta_; } - - private: - std::unique_ptr rwlock_{nullptr}; - - SparseMeta meta_; - std::unordered_map values_dims_; - const size_t shard_mask_ = 127; - const size_t shard_num_ = 128; - std::vector> shard_blocks_; -}; - -class LargeScaleKV { - public: - LargeScaleKV() {} - - explicit LargeScaleKV(const std::vector &table_metas) { - for (auto &sparse_meta : table_metas) { - auto table_name = sparse_meta.name; - auto meta = std::shared_ptr( - new SparseVariable(std::move(sparse_meta))); - sparse_variables[table_name] = meta; - grad_to_variables[sparse_meta.grad_name] = table_name; - grad_names_.push_back(sparse_meta.grad_name); - } - } - - ~LargeScaleKV() {} - - static std::shared_ptr GetInstantcePtr() { return scale_kv_; } - - static LargeScaleKV *GetInstance() { return scale_kv_.get(); } - - static LargeScaleKV *InitInstance( - const std::vector &table_metas) { - std::call_once(init_flag_, &LargeScaleKV::Init, table_metas); - return scale_kv_.get(); - } - - static void Init(const std::vector &table_metas) { - if (scale_kv_.get() == nullptr) { - scale_kv_.reset(new LargeScaleKV(table_metas)); - } - } - - SparseVariable *Get(const std::string &name) { - auto variable = sparse_variables.at(name); - return variable.get(); - } - - bool ParamInLargeScale(const std::string &name) { - auto got = sparse_variables.find(name); - - if (got == sparse_variables.end()) { - return false; - } - - return true; - } - - bool GradInLargeScale(const std::string &name) { - auto got = grad_to_variables.find(name); - - if (got == grad_to_variables.end()) { - return false; - } - - return true; - } - - SparseVariable *GetByGrad(const std::string &name) { - return Get(grad_to_variables[name]); - } - - const std::vector &GetAllGrads() { return grad_names_; } - - private: - std::unordered_map> - sparse_variables; - std::unordered_map grad_to_variables; - std::vector grad_names_; - static std::shared_ptr scale_kv_; - static std::once_flag init_flag_; -}; - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/parameter_prefetch.cc b/paddle/fluid/operators/distributed/parameter_prefetch.cc deleted file mode 100644 index 558d70e5c3353..0000000000000 --- a/paddle/fluid/operators/distributed/parameter_prefetch.cc +++ /dev/null @@ -1,311 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
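SparseVariable::GetShard above selects a shard with `id & shard_mask_`, which is equivalent to `id % shard_num_` only because shard_num_ (128) is a power of two and shard_mask_ is 127. The short standalone sketch below, added for illustration and not part of the diff, checks that identity and shows how sequential ids spread across shards.

```cpp
// Illustrative sketch: power-of-two sharding via a bit mask.
#include <cassert>
#include <cstdint>
#include <iostream>
#include <vector>

int main() {
  const int64_t kShardNum = 128;             // shard_num_ in the deleted header
  const int64_t kShardMask = kShardNum - 1;  // shard_mask_ = 127
  std::vector<int64_t> counts(kShardNum, 0);

  for (int64_t id = 0; id < 100000; ++id) {
    int64_t shard = id & kShardMask;
    assert(shard == id % kShardNum);  // holds because 128 is a power of two
    ++counts[shard];
  }
  // Sequential ids spread evenly; real feasign ids only approximate this.
  std::cout << "shard 0 holds " << counts[0] << " ids\n";
  return 0;
}
```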
-// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/operators/distributed/parameter_prefetch.h" -#include -#include -#include -#include -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/selected_rows.h" -#include "paddle/fluid/operators/distributed/distributed.h" - -namespace paddle { -namespace framework { -class ExecutionContext; -class Scope; -} // namespace framework -} // namespace paddle - -namespace paddle { -namespace operators { -namespace distributed { - -class RPCClient; - -using LoDTensor = framework::LoDTensor; -using LoDTensor = framework::LoDTensor; -using SelectedRows = framework::SelectedRows; -using DDim = framework::DDim; - -static void SplitIdsIntoMultipleVarsBySection( - const std::vector &in_ids, - const std::vector &in_varnames, const int tables, - const int pservers, const bool is_distibuted, framework::Scope *scope, - std::vector> *splited_ids, - std::vector> *origin_ids) { - PADDLE_ENFORCE_EQ( - in_varnames.size(), tables, - platform::errors::OutOfRange( - "send varnames size: %d not equal table number: %d, internal error", - in_varnames.size(), tables)); - - PADDLE_ENFORCE_LE( - tables, pservers, - platform::errors::OutOfRange("table number %d not equal or less than " - "pserver number: %d, internal error", - tables, pservers)); - - auto place = platform::CPUPlace(); - - std::set st(in_ids.begin(), in_ids.end()); - std::vector all_ids; - all_ids.assign(st.begin(), st.end()); - - splited_ids->resize(tables); - origin_ids->resize(tables); - - if (is_distibuted) { - for (auto &id : all_ids) { - auto pserver_id = id % pservers; - (*splited_ids)[pserver_id].push_back(id); - (*origin_ids)[pserver_id].push_back(id); - } - } else { - for (auto &id : all_ids) { - auto pserver_id = id % pservers; - (*origin_ids)[pserver_id].push_back(id); - id = id / pservers; - (*splited_ids)[pserver_id].push_back(id); - } - } - - for (size_t i = 0; i < in_varnames.size(); ++i) { - auto *id_tensor = - scope->Var(in_varnames[i])->GetMutable(); - - auto &ids = (*splited_ids)[i]; - if (!ids.empty()) { - auto *id_tensor_data = id_tensor->mutable_data( - framework::make_ddim({static_cast(ids.size()), 1}), place); - memcpy(id_tensor_data, ids.data(), sizeof(int64_t) * ids.size()); - } - } -} - -typedef std::vector> TableAndEndpoints; - -void prefetch_core( - const std::vector &ids, const TableAndEndpoints &tables, - const framework::ExecutionContext &context, const framework::Scope &scope, - const bool is_distributed, - std::unordered_map> *recved_vec_map) { - distributed::RPCClient *rpc_client = - distributed::RPCClient::GetInstance( - context.Attr("trainer_id")); - - int pservers = context.Attr("pserver_num"); - - platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); - auto &actual_ctx = *pool.Get(platform::CPUPlace()); - - std::unique_ptr local_scope = scope.NewTmpScope(); - - std::vector in_var_names; - std::vector out_var_names; - for (size_t i = 0; i < tables.size(); ++i) { - in_var_names.push_back("prefetch_send@" + tables[i].second); - out_var_names.push_back("prefetch_recv@" + tables[i].second); - } - - std::vector> split_ids; - std::vector> origin_ids; - SplitIdsIntoMultipleVarsBySection(ids, in_var_names, tables.size(), pservers, - is_distributed, local_scope.get(), - &split_ids, &origin_ids); - - // create output var in local scope - for (auto &name : out_var_names) { - local_scope->Var(name)->GetMutable(); - } - - std::vector rets; - for (size_t i 
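SplitIdsIntoMultipleVarsBySection above first deduplicates the lookup ids and then routes each id to section `id % pservers`; for a non-distributed table the row index actually sent to that pserver is `id / pservers`, while a distributed table keeps the raw id. Here is a minimal standalone sketch of that routing, added for illustration only; it is not part of the diff and its function name and return shape are hypothetical.

```cpp
// Illustrative sketch: splitting unique ids across parameter-server sections.
#include <iostream>
#include <set>
#include <utility>
#include <vector>

// Returns, per pserver section, pairs of (id sent over the wire, original id).
std::vector<std::vector<std::pair<int64_t, int64_t>>> SplitIds(
    const std::vector<int64_t>& ids, int pservers, bool is_distributed) {
  std::set<int64_t> unique(ids.begin(), ids.end());  // dedup, like the original
  std::vector<std::vector<std::pair<int64_t, int64_t>>> sections(pservers);
  for (int64_t id : unique) {
    int64_t section = id % pservers;
    int64_t sent = is_distributed ? id : id / pservers;
    sections[section].emplace_back(sent, id);
  }
  return sections;
}

int main() {
  auto sections =
      SplitIds({7, 3, 7, 10, 4}, /*pservers=*/3, /*is_distributed=*/false);
  for (size_t s = 0; s < sections.size(); ++s) {
    for (const auto& p : sections[s]) {
      std::cout << "pserver " << s << ": sends row " << p.first
                << " for original id " << p.second << "\n";
    }
  }
  // e.g. id 3 -> pserver 0 row 1; id 7 -> pserver 1 row 2; id 10 -> pserver 1 row 3.
  return 0;
}
```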
= 0; i < in_var_names.size(); i++) { - if (NeedSend(*local_scope.get(), in_var_names[i])) { - VLOG(3) << "sending " << in_var_names[i] << " to " << tables[i].second - << " to get " << out_var_names[i] << " back"; - rets.push_back(rpc_client->AsyncPrefetchVar( - tables[i].second, actual_ctx, *local_scope.get(), in_var_names[i], - out_var_names[i], tables[i].first)); - } else { - VLOG(3) << "don't send no-initialied variable: " << out_var_names[i]; - } - } - for (size_t i = 0; i < rets.size(); i++) { - PADDLE_ENFORCE_NE(rets[i]->Wait(), 0U, platform::errors::ExecutionTimeout( - "internal error in RPCClient")); - } - - for (size_t o_idx = 0; o_idx < out_var_names.size(); ++o_idx) { - auto &ids_in_this_section = origin_ids[o_idx]; - - if (!ids_in_this_section.empty()) { - auto &prefetch_out_var = - local_scope->Var(out_var_names[o_idx])->Get(); - const auto *out_var_data = prefetch_out_var.data(); - auto &dims = prefetch_out_var.dims(); - - PADDLE_ENFORCE_EQ(dims.size(), 2, - platform::errors::InvalidArgument( - "The size of Tensor dims must be 2.")); - PADDLE_ENFORCE_EQ(ids_in_this_section.size(), dims[0], - platform::errors::InvalidArgument( - "The size of ids in this section must equal to " - "dims[0]: %s, but got %s", - dims[0], ids_in_this_section.size())); - - auto row_numel = dims[1]; - - for (int64_t i = 0; i < dims[0]; ++i) { - auto origin_id = ids_in_this_section[i]; - std::vector vecs(row_numel); - - std::copy_n(out_var_data + i * row_numel, row_numel, vecs.begin()); - (*recved_vec_map)[origin_id] = vecs; - } - } else { - VLOG(3) << "ids in this section is empty"; - } - } -} - -void prefetch(const std::string &id_name, const std::string &out_name, - const std::string &persistable_var_name, - const bool is_distributed, - const std::vector &table_names, - const std::vector &endpoints, - const framework::ExecutionContext &context, - const framework::Scope &scope) { - prefetchs({id_name}, {out_name}, persistable_var_name, is_distributed, - table_names, endpoints, context, scope); -} - -void prefetchs(const std::vector &id_var_names, - const std::vector &out_var_names, - const std::string &persistable_var_name, - const bool is_distributed, - const std::vector &table_names, - const std::vector &endpoints, - const framework::ExecutionContext &context, - const framework::Scope &scope) { - auto vec_dim_1 = 0; - auto vec_dim_0 = 0; - framework::Variable *var = scope.FindVar(persistable_var_name); - - if (var->IsType()) { - vec_dim_1 = var->Get().value().dims()[1]; - } else { - vec_dim_0 = var->Get().dims()[0]; - vec_dim_1 = var->Get().dims()[1]; - } - - PADDLE_ENFORCE_GT(vec_dim_1, 0, - platform::errors::InvalidArgument( - "lookup table var's dim must gather than 0")); - - const auto place = - scope.FindVar(id_var_names[0])->Get().place(); - - std::vector> ids_group; - std::vector ids_union; - std::vector ids_lods; - TableAndEndpoints tables; - - for (auto &id_name : id_var_names) { - auto &id_tensor = scope.FindVar(id_name)->Get(); - std::vector ids; - TensorToVector(id_tensor, context.device_context(), &ids); - ids_union.insert(ids_union.end(), ids.begin(), ids.end()); - ids_group.push_back(ids); - ids_lods.push_back(id_tensor.lod()); - } - - std::unordered_set s(ids_union.begin(), ids_union.end()); - ids_union.assign(s.begin(), s.end()); - - for (auto &i : ids_union) { - PADDLE_ENFORCE_GE( - i, 0, platform::errors::OutOfRange( - "each element in embedding should be larger or equal 0")); - if (!is_distributed) { - PADDLE_ENFORCE_LT( - i, vec_dim_0, - platform::errors::OutOfRange( - 
"embedding id must in [0, %d) when is_distributed False", - vec_dim_0)); - } - } - - for (size_t i = 0; i < table_names.size(); i++) { - tables.push_back(std::make_pair(table_names[i], endpoints[i])); - } - std::unordered_map> recved_vec_map; - prefetch_core(ids_union, tables, context, scope, is_distributed, - &recved_vec_map); - - auto padding_idx = distributed::kNoPadding; - - if (context.HasAttr("padding_idx")) { - padding_idx = context.Attr("padding_idx"); - } - - for (size_t i = 0; i < out_var_names.size(); i++) { - std::vector ids = ids_group[i]; - auto ids_size = ids.size(); - auto *out_t = - scope.FindVar(out_var_names[i])->GetMutable(); - out_t->set_lod(ids_lods[i]); - out_t->Resize( - framework::make_ddim({static_cast(ids_size), vec_dim_1})); - auto *out_d = out_t->mutable_data(place); - - if (platform::is_cpu_place(out_t->place())) { - for (auto idx = 0; idx < static_cast(ids_size); idx++) { - const auto &id = ids[idx]; - if (padding_idx != distributed::kNoPadding && id == padding_idx) { - memset(out_d + idx * vec_dim_1, 0, sizeof(float) * vec_dim_1); - } else { - std::copy_n(recved_vec_map[id].begin(), vec_dim_1, - out_d + idx * vec_dim_1); - } - } - } else { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - std::vector ids_value_vec(ids_size * vec_dim_1); - for (auto idx = 0; idx < static_cast(ids_size); idx++) { - const auto &id = ids[idx]; - if (padding_idx != distributed::kNoPadding && id == padding_idx) { - memset(&ids_value_vec[idx * vec_dim_1], 0, sizeof(float) * vec_dim_1); - } else { - memcpy(&ids_value_vec[idx * vec_dim_1], &recved_vec_map[id][0], - sizeof(float) * vec_dim_1); - } - } - auto &gpu_place = BOOST_GET_CONST(platform::CUDAPlace, out_t->place()); - auto &cpu_place = BOOST_GET_CONST( - platform::CPUPlace, paddle::platform::CPUDeviceContext().GetPlace()); - auto stream = context.cuda_device_context().stream(); - memory::Copy(gpu_place, out_d, cpu_place, &ids_value_vec[0], - sizeof(float) * ids_size * vec_dim_1, stream); -#else - PADDLE_ENFORCE(true, platform::errors::PermissionDenied( - "Paddle is not compiled with GPU!")); -#endif - } - } -} - -}; // namespace distributed -}; // namespace operators -}; // namespace paddle diff --git a/paddle/fluid/operators/distributed/parameter_prefetch.h b/paddle/fluid/operators/distributed/parameter_prefetch.h deleted file mode 100644 index 6fd3a998813c0..0000000000000 --- a/paddle/fluid/operators/distributed/parameter_prefetch.h +++ /dev/null @@ -1,53 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#pragma once - -#include -#include -#include - -#include "paddle/fluid/framework/operator.h" - -namespace paddle { -namespace framework { -class ExecutionContext; -class Scope; -} // namespace framework -} // namespace paddle - -namespace paddle { -namespace operators { -namespace distributed { - -constexpr int64_t kNoPadding = -1; - -void prefetchs(const std::vector& id_var_names, - const std::vector& out_var_names, - const std::string& persistable_var_name, const bool backfill, - const std::vector& table_names, - const std::vector& endpoints, - const framework::ExecutionContext& context, - const framework::Scope& scope); - -void prefetch(const std::string& id_name, const std::string& out_name, - const std::string& persistable_var_name, const bool backfill, - const std::vector& table_names, - const std::vector& endpoints, - const framework::ExecutionContext& context, - const framework::Scope& scope); - -}; // namespace distributed -}; // namespace operators -}; // namespace paddle diff --git a/paddle/fluid/operators/distributed/parameter_recv.cc b/paddle/fluid/operators/distributed/parameter_recv.cc deleted file mode 100644 index d5d3c9c3c7c48..0000000000000 --- a/paddle/fluid/operators/distributed/parameter_recv.cc +++ /dev/null @@ -1,248 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
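Reviewer note: the `prefetchs()` implementation removed above assembles its outputs on the CPU by copying each prefetched row into the output buffer and zero-filling rows whose id equals `padding_idx` (`kNoPadding` means no padding row). A simplified standalone sketch of that assembly step, assuming a plain `unordered_map` holds the received rows; all names here are illustrative.

```cpp
#include <cstdint>
#include <cstring>
#include <unordered_map>
#include <vector>

constexpr int64_t kNoPadding = -1;  // mirrors the constant in the removed header

// Fill a row-major [ids.size() x dim] buffer from prefetched rows, zeroing
// rows whose id equals padding_idx, as the removed prefetchs() does on CPU.
void FillOutput(const std::vector<int64_t>& ids,
                const std::unordered_map<int64_t, std::vector<float>>& recved,
                int64_t padding_idx, int64_t dim, float* out) {
  for (size_t i = 0; i < ids.size(); ++i) {
    float* row = out + i * dim;
    if (padding_idx != kNoPadding && ids[i] == padding_idx) {
      std::memset(row, 0, sizeof(float) * dim);  // padding id -> all zeros
    } else {
      const std::vector<float>& vec = recved.at(ids[i]);
      std::memcpy(row, vec.data(), sizeof(float) * dim);
    }
  }
}

int main() {
  std::unordered_map<int64_t, std::vector<float>> recved = {
      {7, {1.f, 2.f}}, {9, {3.f, 4.f}}};
  std::vector<float> out(3 * 2);
  FillOutput({7, 0, 9}, recved, /*padding_idx=*/0, /*dim=*/2, out.data());
  // out is now {1,2, 0,0, 3,4}
}
```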
- -#include -#include -#include - -#include "glog/logging.h" -#include "paddle/fluid/framework/ddim.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/framework/selected_rows.h" -#include "paddle/fluid/operators/distributed/communicator_common.h" -#include "paddle/fluid/operators/distributed/distributed.h" -#include "paddle/fluid/operators/distributed/parameter_recv.h" -#include "paddle/fluid/platform/device_context.h" -#include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/place.h" - -namespace paddle { -namespace operators { -namespace distributed { - -class RPCClient; - -using LoDTensor = framework::LoDTensor; -using LoDTensor = framework::LoDTensor; -using SelectedRows = framework::SelectedRows; -using DDim = framework::DDim; - -template -void RecvSparseLodTensor(const CommContext &rpc_ctx, - const framework::Scope &scope) { - platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); - auto cpu_place = platform::CPUPlace(); - auto &cpu_ctx = *pool.Get(cpu_place); - - distributed::RPCClient *rpc_client = - distributed::RPCClient::GetInstance(rpc_ctx.trainer_id); - - std::unique_ptr local_scope = scope.NewTmpScope(); - std::vector tensors; - std::vector rets; - std::vector recv_varnames; - for (size_t i = 0; i < rpc_ctx.splited_varnames.size(); i++) { - auto &recv_var_name = rpc_ctx.splited_varnames[i]; - VLOG(4) << "recv " << recv_var_name << " from " << rpc_ctx.epmap[i]; - local_scope->Var(recv_var_name); - // sparse param in recv_scope is LoDTensor - rets.push_back(rpc_client->AsyncGetVarNoBarrier( - rpc_ctx.epmap[i], cpu_ctx, *local_scope.get(), recv_var_name, - recv_var_name)); - recv_varnames.push_back(recv_var_name); - } - - for (size_t i = 0; i < rets.size(); i++) { - PADDLE_ENFORCE_NE(rets[i]->Wait(), 0U, platform::errors::ExecutionTimeout( - "internal error in RPCClient")); - auto &recv_var_name = recv_varnames[i]; - auto *local_var = local_scope->FindVar(recv_var_name); - const auto *value = local_var->Get().data(); - tensors.push_back(value); - } - - auto *merged_var = scope.FindVar(rpc_ctx.var_name); - - if (merged_var == nullptr || !merged_var->IsInitialized()) { - PADDLE_THROW( - platform::errors::InvalidArgument("%s must initialized at first.")); - } - auto dims1 = merged_var->Get().dims()[1]; - int64_t height = 0; - for (size_t i = 0; i < rpc_ctx.splited_varnames.size(); i++) { - auto *splited_var = local_scope->FindVar(rpc_ctx.splited_varnames[i]); - height += splited_var->Get().dims()[0]; - } - - PADDLE_ENFORCE_EQ( - merged_var->Get().dims()[0], height, - platform::errors::InvalidArgument( - "Received variable must has same dimension with local variable.")); - - auto *merged_t = merged_var->GetMutable(); - auto *merged_d = merged_t->mutable_data(cpu_place); - - auto pserver_num = rpc_ctx.splited_varnames.size(); - for (int x = 0; x < height; ++x) { - auto id = x % pserver_num; - auto idx = x / pserver_num; - std::memcpy(merged_d + x * dims1, tensors[id] + idx * dims1, - sizeof(float) * dims1); - } -} - -template -void RecvGeoSparseRecords(const CommContext &rpc_ctx, - const framework::Scope &scope) { - platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); - auto cpu_place = platform::CPUPlace(); - auto &cpu_ctx = *pool.Get(cpu_place); - - distributed::RPCClient *rpc_client = - distributed::RPCClient::GetInstance(rpc_ctx.trainer_id); - - std::unique_ptr local_scope = scope.NewTmpScope(); - - std::vector rets; - for (size_t i = 0; i < 
rpc_ctx.splited_varnames.size(); i++) { - auto &recv_var_name = rpc_ctx.splited_varnames[i]; - local_scope->Var(recv_var_name); - VLOG(4) << "recv " << recv_var_name << " from " << rpc_ctx.epmap[i]; - // sparse param in recv_scope is LoDTensor - rets.push_back(rpc_client->AsyncGetVar(rpc_ctx.epmap[i], cpu_ctx, - *local_scope.get(), recv_var_name, - recv_var_name, recv_var_name)); - } - - for (size_t i = 0; i < rets.size(); i++) { - PADDLE_ENFORCE_NE(rets[i]->Wait(), 0U, platform::errors::ExecutionTimeout( - "internal error in RPCClient")); - } - - int64_t height = 0; - int64_t ids_num = 0; - int64_t width = 0; - - std::vector all_ids; - auto pserver_num = rpc_ctx.splited_varnames.size(); - - for (size_t i = 0; i < rpc_ctx.splited_varnames.size(); i++) { - auto &recv_var_name = rpc_ctx.splited_varnames[i]; - auto *recv_var = local_scope->FindVar(recv_var_name); - auto &recv_t = recv_var->Get(); - - height += recv_t.height(); - ids_num += recv_t.rows().size(); - width = recv_t.value().dims()[1]; - - if (rpc_ctx.is_distributed) { - std::copy(recv_t.rows().begin(), recv_t.rows().end(), - std::back_inserter(all_ids)); - } else { - std::transform(recv_t.rows().begin(), recv_t.rows().end(), - std::back_inserter(all_ids), - [&](int64_t id) { return id * pserver_num + i; }); - } - } - - auto *var = scope.FindVar(rpc_ctx.var_name); - auto *t_ = var->GetMutable(); - T *out_data = - t_->mutable_value()->mutable_data({ids_num, width}, cpu_place); - t_->set_height(height); - t_->set_rows(all_ids); - - int64_t cnt = 0; - for (size_t i = 0; i < rpc_ctx.splited_varnames.size(); i++) { - auto &recv_var_name = rpc_ctx.splited_varnames[i]; - auto *recv_var = local_scope->FindVar(recv_var_name); - auto &recv_t = recv_var->Get(); - - auto rows = recv_t.rows().size(); - const T *in_data = recv_t.value().data(); - std::copy_n(in_data, rows * width, out_data + cnt); - cnt += rows * width; - } - t_->SyncIndex(); -} - -template -void RecvLodTensor(const CommContext &rpc_ctx, const framework::Scope &scope) { - distributed::RPCClient *rpc_client = - distributed::RPCClient::GetInstance(rpc_ctx.trainer_id); - - std::vector rets; - - // variable do not spilt - if (rpc_ctx.origin_varnames.size() == 1 && - rpc_ctx.splited_varnames.size() == 1) { - auto varname = rpc_ctx.origin_varnames[0]; - const auto place = - scope.FindVar(varname)->Get().place(); - platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); - auto &ctx = *pool.Get(place); - VLOG(4) << "recv " << varname << " from " << rpc_ctx.epmap[0] << " in gpu? 
" - << platform::is_gpu_place(place); - rets.push_back(rpc_client->AsyncGetVarNoBarrier(rpc_ctx.epmap[0], ctx, - scope, varname, varname)); - - for (size_t i = 0; i < rets.size(); i++) { - PADDLE_ENFORCE_NE( - rets[i]->Wait(), 0U, - platform::errors::ExecutionTimeout("internal error in RPCClient")); - } - - VLOG(3) << "ParameterRecv out " << rpc_ctx.var_name; - return; - } else { - PADDLE_ENFORCE(false, platform::errors::Unimplemented( - "ParameterRecv can not recv dense with multi " - "parts now, add it soon.")); - } -} - -template -void ParameterRecv::operator()(const CommContext &rpc_ctx, - const framework::Scope &scope, - bool geo_records) { - VLOG(3) << "ParameterRecv in " << rpc_ctx.var_name; - - PADDLE_ENFORCE_GE(rpc_ctx.origin_varnames.size(), 1, - platform::errors::InvalidArgument( - "origin_varnames.size() >= 1 is permitted")); - - if (rpc_ctx.is_sparse) { - if (geo_records) { - RecvGeoSparseRecords(rpc_ctx, scope); - } else { - RecvSparseLodTensor(rpc_ctx, scope); - } - } else { - RecvLodTensor(rpc_ctx, scope); - } - - VLOG(3) << "ParameterRecv out " << rpc_ctx.var_name; -} -template -void ParameterRecv::operator()(const CommContext &rpc_ctx, - const framework::Scope &scope) { - this->operator()(rpc_ctx, scope, false); -} - -template struct ParameterRecv; - -}; // namespace distributed -}; // namespace operators -}; // namespace paddle diff --git a/paddle/fluid/operators/distributed/parameter_recv.h b/paddle/fluid/operators/distributed/parameter_recv.h deleted file mode 100644 index c30d21aa791e2..0000000000000 --- a/paddle/fluid/operators/distributed/parameter_recv.h +++ /dev/null @@ -1,37 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include - -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/operators/distributed/communicator_common.h" - -namespace paddle { -namespace operators { -namespace distributed { - -template -struct ParameterRecv { - void operator()(const CommContext &rpc_ctx, const framework::Scope &scope, - bool barrier); - - void operator()(const CommContext &rpc_ctx, const framework::Scope &scope); -}; - -}; // namespace distributed -}; // namespace operators -}; // namespace paddle diff --git a/paddle/fluid/operators/distributed/parameter_send.cc b/paddle/fluid/operators/distributed/parameter_send.cc deleted file mode 100644 index 109514ca2541c..0000000000000 --- a/paddle/fluid/operators/distributed/parameter_send.cc +++ /dev/null @@ -1,331 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/operators/distributed/parameter_send.h" -#include -#include -#include "glog/logging.h" -#include "paddle/fluid/framework/ddim.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/selected_rows.h" -#include "paddle/fluid/operators/distributed/communicator_common.h" -#include "paddle/fluid/operators/distributed/distributed.h" -#include "paddle/fluid/operators/distributed/request_handler.h" -#include "paddle/fluid/platform/device_context.h" -#include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/place.h" - -namespace paddle { -namespace framework { -class Scope; -class Tensor; -} // namespace framework -} // namespace paddle - -namespace paddle { -namespace operators { -namespace distributed { - -class RPCClient; - -using LoDTensor = framework::LoDTensor; -using LoDTensor = framework::LoDTensor; -using SelectedRows = framework::SelectedRows; -using DDim = framework::DDim; - -typedef std::vector> EP_SPLIT_TABLE_PAIRS; - -inline EP_SPLIT_TABLE_PAIRS GetMultiFieldCommContext( - const CommContext &rpc_ctx, const framework::Scope &scope, - int multi_parts) { - EP_SPLIT_TABLE_PAIRS table_pairs; - - auto *send_var = scope.FindVar(rpc_ctx.var_name); - if (send_var->IsType()) { - PADDLE_ENFORCE_GE(multi_parts, 1, - platform::errors::InvalidArgument( - "multi_parts must == 1 in parameter send, now is: %d", - multi_parts)); - - for (size_t i = 0; i < rpc_ctx.splited_varnames.size(); i++) { - table_pairs.push_back( - std::make_pair(rpc_ctx.epmap[i], rpc_ctx.splited_varnames[i])); - } - - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "GetMultiFieldCommContext unsupported LoDTensor current!")); - } - - return table_pairs; -} // namespace distributed - -void SendByNotifyRPC(const CommContext &rpc_ctx, - const framework::Scope &scope) { - auto cpu_ctx = paddle::platform::CPUDeviceContext(); - auto &send_var_name = rpc_ctx.var_name; - std::vector rets; - - distributed::RPCClient *rpc_client = - distributed::RPCClient::GetInstance(rpc_ctx.trainer_id); - - if (NeedSend(scope, send_var_name)) { - for (size_t j = 0; j < rpc_ctx.epmap.size(); j++) { - auto &endpoint = rpc_ctx.epmap[j]; - VLOG(4) << "sending " << send_var_name << " to " << endpoint; - rets.push_back(rpc_client->AsyncDistributeNotify(endpoint, cpu_ctx, scope, - send_var_name)); - VLOG(4) << "send var " << send_var_name << " by notify RPC done"; - } - } else { - VLOG(3) << "don't send non-initialized variable: " << rpc_ctx.var_name; - } - - for (auto &handle : rets) { - PADDLE_ENFORCE_NE(handle->Wait(), 0U, platform::errors::ExecutionTimeout( - "internal error in RPCClient")); - } -} - -template -void ParameterSend::operator()(const CommContext &rpc_ctx, - const framework::Scope &scope, bool sync, - int multi_parts) { - if (rpc_ctx.var_name == STEP_COUNTER) { - SendByNotifyRPC(rpc_ctx, scope); - return; - } - - std::unique_ptr local_scope = scope.NewTmpScope(); - - platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); - auto &cpu_ctx = *pool.Get(platform::CPUPlace()); - - distributed::RPCClient *rpc_client = - 
distributed::RPCClient::GetInstance(rpc_ctx.trainer_id); - - std::vector rets; - auto *send_var = scope.FindVar(rpc_ctx.var_name); - - if (send_var->IsType()) { - size_t out_num = rpc_ctx.splited_varnames.size(); - if (out_num > 1) { - auto &send_tensor = send_var->Get(); - auto &send_tensor_dims = send_tensor.dims(); - std::vector outs_dims; - outs_dims.reserve(out_num); - - // infer output shape - PADDLE_ENFORCE_EQ( - rpc_ctx.height_sections.size(), out_num, - platform::errors::InvalidArgument("tensor split sections size" - "should be equal to output size.")); - for (size_t i = 0; i < out_num; ++i) { - auto dim = send_tensor_dims; - dim[0] = rpc_ctx.height_sections[i]; - outs_dims.push_back(dim); - } - - // create output var in local scope - size_t row_offset = 0; - for (size_t i = 0; i < out_num; ++i) { - framework::Tensor *out = local_scope->Var(rpc_ctx.splited_varnames[i]) - ->GetMutable(); - *out = send_tensor.Slice(row_offset, row_offset + outs_dims[i][0]); - row_offset += outs_dims[i][0]; - } - } else { - auto &send_tensor = send_var->Get(); - framework::Tensor *out = local_scope->Var(rpc_ctx.splited_varnames[0]) - ->GetMutable(); - out->ShareDataWith(send_tensor); - } - - for (size_t i = 0; i < rpc_ctx.splited_varnames.size(); i++) { - auto &send_var_name = rpc_ctx.splited_varnames[i]; - auto &endpoint = rpc_ctx.epmap[i]; - VLOG(4) << " send var name: " << send_var_name - << "endpoint: " << endpoint; - if (NeedSend(*local_scope.get(), send_var_name)) { - VLOG(3) << "sending " << send_var_name << " to " << endpoint; - rets.push_back(rpc_client->AsyncSendVar( - endpoint, cpu_ctx, *local_scope.get(), send_var_name)); - VLOG(4) << "send var " << send_var_name << " async handle done"; - } else { - VLOG(3) << "don't send non-initialized variable: " - << rpc_ctx.splited_varnames[i]; - } - } - } else if (send_var->IsType()) { - auto &send_slr = send_var->Get(); - - auto &send_rows = send_slr.rows(); - if (send_rows.size() == 0) { - LOG(WARNING) - << "WARNING: The variable sent to pserver is empty, which " - "may cause an unknown error. 
Please check the state of " - "use_double_buffer in pyreader/dataloader async mode, you need to " - "turn it false."; - } - - std::vector> outs_rows_idx; - std::vector> outs_dense_idx; - - auto table_pairs = GetMultiFieldCommContext(rpc_ctx, scope, 1); - outs_rows_idx.resize(table_pairs.size()); - outs_dense_idx.resize(table_pairs.size()); - - auto row_numel = send_slr.value().numel() / send_slr.value().dims()[0]; - auto *src = send_slr.value().data(); - - // create output var in local scope - std::vector outs; - for (auto &table : table_pairs) { - auto *out = - local_scope->Var(table.second)->GetMutable(); - outs.push_back(out); - } - - if (!rpc_ctx.is_distributed) { - auto pserver_num = rpc_ctx.epmap.size(); - - // split rows index into output sparse vars - for (size_t i = 0; i < send_rows.size(); ++i) { - auto ep_idx = send_rows[i] % pserver_num; - auto id = send_rows[i] / pserver_num; - outs_rows_idx[ep_idx].push_back(id); - outs_dense_idx[ep_idx].push_back(i); - } - - auto place = platform::CPUPlace(); - - for (size_t out_idx = 0; out_idx < rpc_ctx.splited_varnames.size(); - out_idx++) { - auto rows_idx = outs_rows_idx[out_idx]; - - auto dims = send_slr.GetCompleteDims(); - dims[0] = rows_idx.size(); - outs[out_idx]->set_height(rpc_ctx.height_sections[out_idx]); - outs[out_idx]->mutable_rows()->clear(); - outs[out_idx]->mutable_value()->mutable_data(dims, send_slr.place()); - - if (rows_idx.size() > 0) { - for (auto idx : rows_idx) { - outs[out_idx]->mutable_rows()->push_back(idx); - } - auto dst = outs[out_idx]->mutable_value()->mutable_data(place); - for (size_t j = 0; j < rows_idx.size(); j++) { - if (platform::is_cpu_place(place)) { - memory::Copy(platform::CPUPlace(), dst + j * row_numel, - platform::CPUPlace(), - src + outs_dense_idx[out_idx][j] * row_numel, - sizeof(T) * row_numel); - } else { - PADDLE_THROW( - platform::errors::Unimplemented("do not support GPU now")); - } - } - } - PADDLE_ENFORCE_EQ( - rows_idx.size(), outs[out_idx]->rows().size(), - platform::errors::InvalidArgument( - "rows should has the same size with tensor dim 0")); - } - } else { - auto pserver_num = rpc_ctx.epmap.size(); - - // split rows index into output sparse vars - for (size_t i = 0; i < send_rows.size(); ++i) { - auto out_idx = send_rows[i] % pserver_num; - outs_rows_idx[out_idx].push_back(send_rows[i]); - outs_dense_idx[out_idx].push_back(i); - } - - auto place = platform::CPUPlace(); - - for (size_t out_idx = 0; out_idx < rpc_ctx.splited_varnames.size(); - out_idx++) { - auto rows_idx = outs_rows_idx[out_idx]; - - auto dims = send_slr.GetCompleteDims(); - dims[0] = rows_idx.size(); - - outs[out_idx]->set_height(rpc_ctx.height_sections[out_idx]); - outs[out_idx]->mutable_rows()->clear(); - outs[out_idx]->mutable_value()->mutable_data(dims, send_slr.place()); - - if (rows_idx.size() > 0) { - for (auto idx : rows_idx) { - outs[out_idx]->mutable_rows()->push_back(idx); - } - auto dst = outs[out_idx]->mutable_value()->mutable_data(place); - for (size_t j = 0; j < rows_idx.size(); j++) { - if (platform::is_cpu_place(place)) { - memory::Copy(platform::CPUPlace(), dst + j * row_numel, - platform::CPUPlace(), - src + outs_dense_idx[out_idx][j] * row_numel, - sizeof(T) * row_numel); - } else { - PADDLE_THROW( - platform::errors::Unimplemented("do not support GPU now")); - } - } - } - PADDLE_ENFORCE_EQ( - rows_idx.size(), outs[out_idx]->rows().size(), - platform::errors::InvalidArgument( - "rows should has the same size with tensor dim 0")); - } - } - - for (size_t i = 0; i < table_pairs.size(); i++) { - 
auto &send_var_name = table_pairs[i].second; - auto &endpoint = table_pairs[i].first; - auto need_send = NeedSend(*local_scope.get(), send_var_name); - - VLOG(4) << "send var name: " << send_var_name - << " send var endpoint: " << endpoint - << " need send: " << need_send; - - if (need_send) { - VLOG(4) << "sending " << send_var_name << " to " << endpoint; - - rets.push_back(rpc_client->AsyncSendVar( - endpoint, cpu_ctx, *local_scope.get(), send_var_name)); - VLOG(4) << "send var " << send_var_name << " async handle done"; - } else { - VLOG(4) << "don't send non-initialized variable: " - << rpc_ctx.splited_varnames[i]; - } - } - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "unsupported var type: %s to send!", send_var->Type())); - } - - VLOG(4) << "Prepare to send var " << rpc_ctx.var_name; - if (sync) { - for (auto &handle : rets) { - VLOG(4) << "Wait send var to pserver handle: " << handle; - PADDLE_ENFORCE_NE(handle->Wait(), 0U, platform::errors::ExecutionTimeout( - "internal error in RPCClient")); - } - } -} - -template struct ParameterSend; - -}; // namespace distributed -}; // namespace operators -}; // namespace paddle diff --git a/paddle/fluid/operators/distributed/proto_encoder_helper.h b/paddle/fluid/operators/distributed/proto_encoder_helper.h deleted file mode 100644 index cedc98b1fcadd..0000000000000 --- a/paddle/fluid/operators/distributed/proto_encoder_helper.h +++ /dev/null @@ -1,146 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -// NOTE: This file was originally created by tensorflow -// (https://github.com/tensorflow/tensorflow/) we borrow this -// file and did some modifications so that we can send gRPC -// requests without too much copying of the tensor data. 
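Reviewer note: the header removed here hand-rolls the protobuf wire format so tensors can be serialized for gRPC without extra copies, and its core is base-128 varint encoding. As a standalone illustration of the same scheme (equivalent in spirit to the `EncodeVarint64` helper in the deleted file; the driver code is mine):

```cpp
#include <cstdint>
#include <cstdio>

// Protobuf-style base-128 varint: 7 payload bits per byte, MSB set on every
// byte except the last. Returns the number of bytes written (at most 10).
int EncodeVarint64(uint64_t v, unsigned char* dst) {
  int n = 0;
  while (v >= 0x80) {
    dst[n++] = static_cast<unsigned char>((v & 0x7F) | 0x80);
    v >>= 7;
  }
  dst[n++] = static_cast<unsigned char>(v);
  return n;
}

int main() {
  unsigned char buf[10];
  int n = EncodeVarint64(300, buf);  // expected bytes: AC 02
  for (int i = 0; i < n; ++i) std::printf("%02X ", buf[i]);
  std::printf("\n");
}
```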
- -#pragma once - -#include - -#include "grpc++/grpc++.h" -#include "paddle/fluid/platform/enforce.h" - -namespace paddle { -namespace operators { -namespace distributed { - -char* EncodeVarint32(char* dst, uint32_t v) { - // Operate on characters as unsigneds - unsigned char* ptr = reinterpret_cast(dst); - static const int B = 128; - if (v < (1 << 7)) { - *(ptr++) = v; - } else if (v < (1 << 14)) { - *(ptr++) = v | B; - *(ptr++) = v >> 7; - } else if (v < (1 << 21)) { - *(ptr++) = v | B; - *(ptr++) = (v >> 7) | B; - *(ptr++) = v >> 14; - } else if (v < (1 << 28)) { - *(ptr++) = v | B; - *(ptr++) = (v >> 7) | B; - *(ptr++) = (v >> 14) | B; - *(ptr++) = v >> 21; - } else { - *(ptr++) = v | B; - *(ptr++) = (v >> 7) | B; - *(ptr++) = (v >> 14) | B; - *(ptr++) = (v >> 21) | B; - *(ptr++) = v >> 28; - } - return reinterpret_cast(ptr); -} - -char* EncodeVarint64(char* dst, uint64_t v) { - static const int B = 128; - unsigned char* ptr = reinterpret_cast(dst); - while (v >= B) { - *(ptr++) = (v & (B - 1)) | B; - v >>= 7; - } - *(ptr++) = static_cast(v); - return reinterpret_cast(ptr); -} - -int VarintLength(uint64_t v) { - int len = 1; - while (v >= 128) { - v >>= 7; - len++; - } - return len; -} - -class ProtoEncodeHelper { - public: - ProtoEncodeHelper(char* buf, int max_size) - : base_(buf), p_(buf), limit_(base_ + max_size) {} - - ~ProtoEncodeHelper() {} - - const char* data() const { return base_; } - size_t size() const { return p_ - base_; } - - void WriteUint64(int tag, uint64_t v) { - Encode32(combine(tag, WIRETYPE_VARINT)); - Encode64(v); - } - void WriteBool(int tag, bool v) { - Encode32(combine(tag, WIRETYPE_VARINT)); - EncodeBool(v); - } - void WriteString(int tag, const std::string& v) { - Encode32(combine(tag, WIRETYPE_LENGTH_DELIMITED)); - Encode32(v.size()); - EncodeBytes(v.data(), v.size()); - } - void WriteVarlengthBeginning(int tag, uint32_t len) { - Encode32(combine(tag, WIRETYPE_LENGTH_DELIMITED)); - Encode32(len); - } - void WriteRawBytes(const std::string& v) { EncodeBytes(v.data(), v.size()); } - - private: - // Note: this module's behavior must match the protocol buffer wire encoding - // format. - enum { - WIRETYPE_VARINT = 0, - WIRETYPE_LENGTH_DELIMITED = 2, - }; - static uint32_t combine(uint32_t tag, uint32_t type) { - return ((tag << 3) | type); - } - inline void Encode32(uint32_t v) { - if (v < 128) { - // Fast path for single-byte values. Many of the calls will use a - // constant value for v, so the comparison will get optimized away - // when Encode32 is inlined into the caller. - *p_ = v; - p_++; - } else { - p_ = EncodeVarint32(p_, v); - } - } - void Encode64(uint64_t v) { p_ = EncodeVarint64(p_, v); } - void EncodeBool(bool v) { - *p_ = (v ? 1 : 0); // Equal to varint32 encoding of 0 or 1 - p_++; - } - void EncodeBytes(const char* bytes, int N) { - memcpy(p_, bytes, N); - p_ += N; - } - - char* base_; - char* p_; - char* limit_; // Just for CHECKs -}; - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/request_handler.h b/paddle/fluid/operators/distributed/request_handler.h deleted file mode 100644 index 44359af1b1b2a..0000000000000 --- a/paddle/fluid/operators/distributed/request_handler.h +++ /dev/null @@ -1,261 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include // NOLINT - -#include -#include -#include -#include -#include -#include - -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/executor.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/framework/selected_rows.h" -#include "paddle/fluid/framework/var_type.h" -#include "paddle/fluid/platform/macros.h" - -namespace paddle { -namespace operators { -namespace distributed { - -constexpr char kRequestSend[] = "RequestSend"; -constexpr char kRequestGet[] = "RequestGet"; -constexpr char kRequestGetMonomerVariable[] = "RequestGetMonomerVariable"; -constexpr char kRequestGetMonomerBarrier[] = "RequestGetMonomerBarrier"; -constexpr char kRequestPrefetch[] = "RequestPrefetch"; -constexpr char kRequestCheckpoint[] = "RequestCheckpoint"; -constexpr char kRequestPassBarrier[] = "RequestPassBarrier"; -constexpr char kRequestGetNoBarrier[] = "GetVariableNoBarrier"; -constexpr char kRequestNotify[] = "RequestNotify"; -constexpr char kRequestSendAndRecv[] = "RequestSendAndRecv"; - -constexpr char kSendRPC[] = "SendRPC"; -constexpr char kGetRPC[] = "GetRPC"; -constexpr char kGetNoBarrierRPC[] = "GetNoBarrierRPC"; -constexpr char kGetMonomerRPC[] = "GetMonomerRPC"; -constexpr char kPrefetchRPC[] = "PrefetchRPC"; -constexpr char kBatchBarrierRPC[] = "BatchBarrierRPC"; -constexpr char kFetchBarrierRPC[] = "FetchBarrierRPC"; -constexpr char kSendMonomerFetchBarrierRPC[] = "SendMonomerFetchBarrierRPC"; -constexpr char kSendCompleteRPC[] = "SendCompleteRPC"; -constexpr char kCheckPointNotifyRPC[] = "CheckPointNotifyRPC"; -constexpr char kSendAndRecvRPC[] = "SendAndRecvRPC"; -constexpr int64_t kPrefetchTimeout = 60000; - -#define LISTEN_TERMINATE_MESSAGE "TERMINATE@RECV" -#define BATCH_BARRIER_MESSAGE "BATCH_BARRIER@RECV" -#define FETCH_BARRIER_MESSAGE "FETCH_BARRIER@RECV" -#define COMPLETE_MESSAGE "COMPLETE@RECV" -#define WITHOUT_BARRIER_MESSAGE "@WITHOUT_BARRIER@RECV" -#define LEARNING_RATE_DECAY_COUNTER "@LR_DECAY_COUNTER@" -#define STEP_COUNTER "@PS_STEP_COUNTER@" - -#define CHECKPOINT_SAVE_MESSAGE "SAVE@CHECKPOINTNOTIFY" -#define CHECKPOINT_LOAD_MESSAGE "LOAD@CHECKPOINTNOTIFY" - -enum DistributedMode { kSync = 0, kAsync = 1, kHalfAsync = 2, kGeo = 3 }; - -class RPCServer; - -class VarHandle { - public: - VarHandle(const std::string ep, const std::string& method, - const std::string& name, - const platform::DeviceContext* p_ctx = nullptr, - const framework::Scope* p_scope = nullptr) - : status_(kDefaultState) { - ep_ = ep; - ctx_ = p_ctx; - scope_ = p_scope; - name_ = name; - method_ = method; - } - - virtual ~VarHandle() {} - - public: - bool should_retry = false; - - bool Wait() { - int ret = kDefaultState; - { - std::unique_lock lk(sync_mutex_); - wait_cond_.wait(lk, [this] { return status_ != kDefaultState; }); - ret = status_; - } - VLOG(7) << "VarHandle wait:" << ret; - return ret != kErrorState; - } - - void Finish(bool ok) { - { - std::unique_lock lk(sync_mutex_); - status_ = ok ? 
kFinishState : kErrorState; - } - VLOG(7) << "VarHandle finish:" << ok; - wait_cond_.notify_all(); - } - - std::string String() const { - std::ostringstream s; - s << method_ << " name:[" << name_ << "], ep:[" << ep_ << "], status:[" - << status_ << "]"; - return s.str(); - } - - std::string ep() const { return ep_; } - const platform::DeviceContext* ctx() const { return ctx_; } - const framework::Scope* scope() const { return scope_; } - std::string name() const { return name_; } - std::string method() const { return method_; } - - protected: - // RPC endpoint. - std::string ep_; - const platform::DeviceContext* ctx_; - const framework::Scope* scope_; - // Variable name. - std::string name_; - // RPC method name. - std::string method_; - - protected: - std::mutex sync_mutex_; - std::condition_variable wait_cond_; - - enum VarHandleStatus { - kDefaultState = -1, - kErrorState = 0, - kFinishState = 1, - }; - VarHandleStatus status_; - - private: - DISABLE_COPY_AND_ASSIGN(VarHandle); -}; - -typedef std::shared_ptr VarHandlePtr; - -class RequestHandler { - public: - explicit RequestHandler(int distributed_mode) - : distributed_mode_(distributed_mode), - dev_ctx_(nullptr), - executor_(nullptr), - scope_(nullptr), - program_(nullptr), - rpc_server_(nullptr) {} - - virtual ~RequestHandler() {} - - // Set attributes. - void SetScope(framework::Scope* scope) { scope_ = scope; } - void SetDevCtx(const platform::DeviceContext* dev_ctx) { dev_ctx_ = dev_ctx; } - void SetProgram(framework::ProgramDesc* program) { program_ = program; } - void SetExecutor(framework::Executor* executor) { executor_ = executor; } - - // Used for dist lookup table prefetch - void SetPrefetchPreparedCtx( - std::unordered_map< - std::string, std::shared_ptr>* g) { - prefetch_var_name_to_prepared_ctx_ = g; - } - - void SetCheckpointNotifyPreparedCtx( - std::shared_ptr g) { - checkpoint_prepared_ctx_ = g; - } - - // Used for async. - void SetGradToPreparedCtx( - std::unordered_map< - std::string, std::shared_ptr>* g) { - grad_to_prepared_ctx_ = g; - } - - void SetSparseGradToParam(std::unordered_map* g) { - sparse_grad_to_param_ = g; - } - - void SetLrDecayPreparedCtx( - std::shared_ptr g) { - lr_decay_prepared_ctx_ = g; - } - - void SetRPCServer(RPCServer* rpc_server) { rpc_server_ = rpc_server; } - - // Get attributes. - int distributed_mode() { return distributed_mode_; } - framework::Scope* scope() { return scope_; } - const platform::DeviceContext* dev_ctx() { return dev_ctx_; } - framework::ProgramDesc* program() { return program_; } - framework::Executor* executor() { return executor_; } - - // This function processes user's rpc request. - // The implemention is in request_handler_impl. 
- // example: - // std::string varname = request_.varname(); - // - // auto scope = request_handler_->scope(); - // auto invar = scope->FindVar(varname); - // framework::Variable* outvar = nullptr; - // - // request_handler_->Handle(varname, scope, invar, &outvar); - // if (outvar) { - // SerializeToByteBuffer(varname, outvar, - // *request_handler_->dev_ctx(), &reply_); - // } - virtual bool Handle(const std::string& varname, framework::Scope* scope, - framework::Variable* var, framework::Variable** outvar, - const int trainer_id, - const std::string& out_var_name = "", - const std::string& table_name = "") = 0; - - protected: - const int distributed_mode_; - - const platform::DeviceContext* dev_ctx_; - framework::Executor* executor_; - framework::Scope* scope_; - framework::ProgramDesc* program_; - - // used for distribute lookup table prefetch - std::unordered_map>* - prefetch_var_name_to_prepared_ctx_; - // used for checkpoint notify - std::shared_ptr checkpoint_prepared_ctx_; - - // Used for async. - std::unordered_map>* - grad_to_prepared_ctx_; - std::unordered_map* sparse_grad_to_param_; - - // used for lr decay - std::shared_ptr lr_decay_prepared_ctx_; - RPCServer* rpc_server_; -}; - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/request_handler_impl.cc b/paddle/fluid/operators/distributed/request_handler_impl.cc deleted file mode 100644 index 8c4f2ef57a32c..0000000000000 --- a/paddle/fluid/operators/distributed/request_handler_impl.cc +++ /dev/null @@ -1,354 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/operators/distributed/request_handler_impl.h" -#include -#include -#include - -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/framework/selected_rows.h" -#include "paddle/fluid/framework/variable_helper.h" -#include "paddle/fluid/operators/distributed/rpc_server.h" -#include "paddle/fluid/string/piece.h" -#include "paddle/fluid/string/printf.h" -#include "paddle/fluid/string/split.h" - -#include "paddle/fluid/operators/distributed/async_sparse_param_update_recorder.h" -#include "paddle/fluid/operators/distributed/heart_beat_monitor.h" -#include "paddle/fluid/operators/distributed/large_scale_kv.h" - -namespace paddle { -namespace operators { -namespace distributed { - -// define LOOKUP_TABLE_PATH for checkpoint notify to save lookup table variables -// to directory specified. 
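Reviewer note: the `VarHandle` removed above is the token an async RPC caller blocks on. `Wait()` sleeps on a condition variable until the transport thread calls `Finish(ok)`, which flips a tri-state status (pending / error / finished) and wakes all waiters. A minimal standalone sketch of that handshake, with illustrative names:

```cpp
#include <condition_variable>
#include <iostream>
#include <mutex>
#include <thread>

// Tri-state completion token mirroring VarHandle::Wait()/Finish():
// -1 = pending (default), 0 = error, 1 = finished.
class Completion {
 public:
  bool Wait() {
    std::unique_lock<std::mutex> lk(mu_);
    cv_.wait(lk, [this] { return status_ != -1; });
    return status_ == 1;
  }
  void Finish(bool ok) {
    {
      std::lock_guard<std::mutex> lk(mu_);
      status_ = ok ? 1 : 0;
    }
    cv_.notify_all();  // wake every caller blocked in Wait()
  }

 private:
  std::mutex mu_;
  std::condition_variable cv_;
  int status_ = -1;
};

int main() {
  Completion handle;
  std::thread rpc([&] { handle.Finish(true); });  // stands in for the RPC callback
  std::cout << (handle.Wait() ? "ok" : "error") << "\n";
  rpc.join();
}
```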
-constexpr char LOOKUP_TABLE_PATH[] = "kLookupTablePath"; - -bool RequestSendHandler::Handle(const std::string &varname, - framework::Scope *scope, - framework::Variable *invar, - framework::Variable **outvar, - const int trainer_id, - const std::string &out_var_name, - const std::string &table_name) { - VLOG(4) << "RequestSendHandler:" << varname; - - // Sync - if (varname == BATCH_BARRIER_MESSAGE) { - VLOG(3) << "sync: recv BATCH_BARRIER_MESSAGE"; - rpc_server_->IncreaseBatchBarrier(kRequestSend); - } else if (varname == COMPLETE_MESSAGE) { - VLOG(3) << "sync: recv complete message"; - - if (HeartBeatMonitor::GetInstance() != nullptr) { - HeartBeatMonitor::GetInstance()->Update(trainer_id, "", COMPLETED); - } - - rpc_server_->Complete(); - } else { - // Async - if (distributed_mode_ != DistributedMode::kSync) { - VLOG(3) << "async process var: " << varname; - if (varname == BATCH_BARRIER_MESSAGE) { - PADDLE_THROW(platform::errors::InvalidArgument( - "async mode should not recv BATCH_BARRIER_MESSAGE or " - "COMPLETE_MESSAGE")); - } - HeartBeatMonitor::GetInstance()->Update(trainer_id, varname, RUNNING); - - std::string run_varname = varname; - - string::Piece part_piece("@PIECE"); - string::Piece var_name_piece = string::Piece(varname); - - if (string::Contains(var_name_piece, part_piece)) { - auto varname_splits = paddle::string::Split(varname, '@'); - PADDLE_ENFORCE_EQ( - varname_splits.size(), 3, - platform::errors::InvalidArgument( - "varname: %s should be separated into 3 parts by @", varname)); - run_varname = varname_splits[0]; - scope->Rename(varname, run_varname); - } - - auto *var = scope->FindVar(run_varname); - - // for sparse ids - if (var->IsType()) { - if (distributed_mode_ == DistributedMode::kAsync || - distributed_mode_ == DistributedMode::kHalfAsync) { - auto *ins = distributed::LargeScaleKV::GetInstance(); - if (ins->GradInLargeScale(run_varname)) { - auto *large_scale_var = ins->GetByGrad(run_varname); - - for (auto name : large_scale_var->CachedVarnames()) { - scope->Var(name); - } - } - } - if (distributed_mode_ == DistributedMode::kGeo) { - if (AsyncSparseParamUpdateRecorder::GetInstance()->HasGrad( - run_varname)) { - auto &grad_slr = - scope->FindVar(run_varname)->Get(); - AsyncSparseParamUpdateRecorder::GetInstance()->Update( - run_varname, grad_slr.rows()); - } - } - } - - executor_->RunPreparedContext((*grad_to_prepared_ctx_)[run_varname].get(), - scope); - return true; - } else { // sync - rpc_server_->WaitCond(kRequestSend); - VLOG(3) << "sync: processing received var: " << varname; - PADDLE_ENFORCE_NOT_NULL( - invar, platform::errors::NotFound( - "sync: Can not find server side var %s.", varname)); - } - } - return true; -} - -bool RequestGetHandler::Handle(const std::string &varname, - framework::Scope *scope, - framework::Variable *invar, - framework::Variable **outvar, - const int trainer_id, - const std::string &out_var_name, - const std::string &table_name) { - VLOG(3) << "RequestGetHandler:" << varname - << " out_var_name: " << out_var_name << " trainer_id: " << trainer_id - << " table_name: " << table_name; - - if (distributed_mode_ == DistributedMode::kSync) { - if (varname == FETCH_BARRIER_MESSAGE) { - VLOG(3) << "sync: recv fetch barrier message"; - rpc_server_->IncreaseBatchBarrier(kRequestGet); - } else { - rpc_server_->WaitCond(kRequestGet); - *outvar = scope_->FindVar(varname); - } - } else { - if (varname != FETCH_BARRIER_MESSAGE && varname != COMPLETE_MESSAGE) { - if (enable_dc_asgd_) { - // NOTE: the format is determined by 
distribute_transpiler.py - std::string param_bak_name = - string::Sprintf("%s.trainer_%d_bak", varname, trainer_id); - VLOG(3) << "getting " << param_bak_name << " trainer_id " << trainer_id; - auto var = scope_->FindVar(varname); - auto t_orig = var->Get(); - auto param_bak = scope_->Var(param_bak_name); - auto t = param_bak->GetMutable(); - t->mutable_data(dev_ctx_->GetPlace(), t_orig.type()); - VLOG(3) << "copying " << varname << " to " << param_bak_name; - framework::TensorCopy(t_orig, dev_ctx_->GetPlace(), t); - } - - if (distributed_mode_ == DistributedMode::kGeo && - AsyncSparseParamUpdateRecorder::GetInstance()->HasParam(varname) && - !table_name.empty()) { - VLOG(3) << "AsyncSparseParamUpdateRecorder " << varname << " exist "; - - std::vector updated_rows; - AsyncSparseParamUpdateRecorder::GetInstance()->GetAndClear( - varname, trainer_id, &updated_rows); - - if (VLOG_IS_ON(3)) { - std::ostringstream sstream; - sstream << "["; - for (auto &row_id : updated_rows) { - sstream << row_id << ", "; - } - sstream << "]"; - VLOG(3) << "updated_rows size: " << updated_rows.size() << " " - << sstream.str(); - } - - auto &origin_tensor = - scope_->FindVar(varname)->Get(); - auto *origin_tensor_data = origin_tensor.data(); - auto &dims = origin_tensor.dims(); - *outvar = scope->Var(); - auto *out_slr = (*outvar)->GetMutable(); - out_slr->set_rows(updated_rows); - out_slr->set_height(dims[0]); - auto out_dims = framework::make_ddim( - {static_cast(updated_rows.size()), dims[1]}); - auto *data = out_slr->mutable_value()->mutable_data( - out_dims, origin_tensor.place()); - auto width = dims[1]; - for (size_t i = 0; i < updated_rows.size(); ++i) { - PADDLE_ENFORCE_LT( - updated_rows[i], dims[0], - platform::errors::OutOfRange( - "The value of updated_rows: %s out of Tensor %s dims[0]: %s", - updated_rows[i], varname, dims[0])); - memcpy(data + i * width, origin_tensor_data + updated_rows[i] * width, - sizeof(float) * width); - } - } else { - *outvar = scope_->FindVar(varname); - } - } - } - return true; -} - -bool RequestGetNoBarrierHandler::Handle(const std::string &varname, - framework::Scope *scope, - framework::Variable *invar, - framework::Variable **outvar, - const int trainer_id, - const std::string &out_var_name, - const std::string &table_name) { - VLOG(4) << "RequestGetNoBarrierHandler:" << varname - << " out_var_name: " << out_var_name; - - // get var from pserver immediately without barriers - string::Piece without_barrier_piece(WITHOUT_BARRIER_MESSAGE); - string::Piece var_name_piece = string::Piece(varname); - - if (string::Contains(var_name_piece, without_barrier_piece)) { - var_name_piece = string::TrimSuffix(var_name_piece, without_barrier_piece); - VLOG(4) << "Get var " << var_name_piece << " with " - << WITHOUT_BARRIER_MESSAGE; - *outvar = scope_->FindVar(var_name_piece.ToString()); - return true; - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "GetNoBarrier must contain %s", WITHOUT_BARRIER_MESSAGE)); - } - return true; -} - -bool RequestPrefetchHandler::Handle(const std::string &varname, - framework::Scope *scope, - framework::Variable *invar, - framework::Variable **outvar, - const int trainer_id, - const std::string &out_var_name, - const std::string &table_name) { - VLOG(4) << "RequestPrefetchHandler " << varname; - - (*outvar)->GetMutable(); - - VLOG(1) << "Prefetch " - << "tablename: " << table_name << " ids:" << varname - << " out: " << out_var_name; - paddle::platform::CPUPlace cpu_place; - auto *ins = distributed::LargeScaleKV::GetInstance(); - - if 
(ins->ParamInLargeScale(table_name)) { - auto lookup_table_op = PullLargeScaleOp(table_name, varname, out_var_name); - lookup_table_op->Run(*scope, cpu_place); - } else { - auto lookup_table_op = - BuildLookupTableOp(table_name, varname, out_var_name); - lookup_table_op->Run(*scope, cpu_place); - } - - return true; -} - -bool RequestCheckpointHandler::Handle(const std::string &varname, - framework::Scope *scope, - framework::Variable *invar, - framework::Variable **outvar, - const int trainer_id, - const std::string &out_var_name, - const std::string &table_name) { - VLOG(4) << "receive save var " << varname << " with path " << out_var_name - << " mode " << table_name; - - int mode = std::stoi(table_name); - - auto *ins = distributed::LargeScaleKV::GetInstance(); - ins->Get(varname)->Save(out_var_name, mode); - return true; -} - -bool RequestNotifyHandler::Handle(const std::string &varname, - framework::Scope *scope, - framework::Variable *invar, - framework::Variable **outvar, - const int trainer_id, - const std::string &out_var_name, - const std::string &table_name) { - VLOG(3) << "RequestNotifyHandler: " << varname - << ", trainer_id: " << trainer_id; - - string::Piece decay_piece(STEP_COUNTER); - string::Piece var_name_piece = string::Piece(varname); - if (string::Contains(var_name_piece, decay_piece)) { - VLOG(3) << "LearningRate Decay Counter Update"; - - auto *send_var = scope->FindVar(varname); - auto send_var_tensor = send_var->Get(); - auto *send_value = - send_var_tensor.mutable_data(send_var_tensor.place()); - - auto counter = decay_counters.at(trainer_id); - counter += send_value[0]; - decay_counters.at(trainer_id) = counter; - - auto *global_step_var = this->scope()->FindVar(LEARNING_RATE_DECAY_COUNTER); - if (global_step_var == nullptr) { - PADDLE_THROW(platform::errors::InvalidArgument( - "can not find LEARNING_RATE_DECAY_COUNTER ")); - } - - auto *tensor = global_step_var->GetMutable(); - auto *value = tensor->mutable_data(platform::CPUPlace()); - - auto global_counter = 0; - for (auto &trainer_counter : decay_counters) { - global_counter += trainer_counter.second; - } - value[0] = global_counter; - - if (lr_decay_prepared_ctx_.get() == nullptr) { - PADDLE_THROW(platform::errors::InvalidArgument( - "can not find decay block for executor")); - } - - executor_->RunPreparedContext(lr_decay_prepared_ctx_.get(), scope_); - } - return true; -} - -bool RequestSendAndRecvHandler::Handle(const std::string &varname, - framework::Scope *Scope, - framework::Variable *var, - framework::Variable **outvar, - const int trainer_id, - const std::string &out_var_name, - const std::string &table_name) { - VLOG(3) << "SendAndRecvHandle: " << varname - << " out_var_name: " << out_var_name - << " , trainer_id: " << trainer_id; - - executor_->RunPreparedContext((*grad_to_prepared_ctx_)[varname].get(), Scope); - *outvar = Scope->FindVar(out_var_name); - return true; -} - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/request_handler_impl.h b/paddle/fluid/operators/distributed/request_handler_impl.h deleted file mode 100644 index 6d239673f9104..0000000000000 --- a/paddle/fluid/operators/distributed/request_handler_impl.h +++ /dev/null @@ -1,198 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include -#include -#include -#include -#include -#include - -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/executor.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/framework/selected_rows.h" -#include "paddle/fluid/framework/var_type.h" -#include "paddle/fluid/operators/distributed/request_handler.h" - -namespace paddle { -namespace framework { -class Scope; -class Variable; -} // namespace framework -} // namespace paddle - -namespace paddle { -namespace operators { -namespace distributed { - -class RequestSendHandler final : public RequestHandler { - public: - explicit RequestSendHandler(int distributed_mode, bool enable_dc_asgd = false) - : RequestHandler(distributed_mode) { - enable_dc_asgd_ = enable_dc_asgd; - } - virtual ~RequestSendHandler() {} - bool Handle(const std::string& varname, framework::Scope* scope, - framework::Variable* var, framework::Variable** outvar, - const int trainer_id, const std::string& out_var_name = "", - const std::string& table_name = "") override; - - private: - bool enable_dc_asgd_; -}; - -class RequestGetHandler final : public RequestHandler { - public: - explicit RequestGetHandler(int distributed_mode, bool enable_dc_asgd = false) - : RequestHandler(distributed_mode) { - enable_dc_asgd_ = enable_dc_asgd; - } - virtual ~RequestGetHandler() {} - bool Handle(const std::string& varname, framework::Scope* scope, - framework::Variable* var, framework::Variable** outvar, - const int trainer_id, const std::string& out_var_name = "", - const std::string& table_name = "") override; - - private: - bool enable_dc_asgd_; -}; - -class RequestGetNoBarrierHandler final : public RequestHandler { - public: - RequestGetNoBarrierHandler() : RequestHandler(false) {} - virtual ~RequestGetNoBarrierHandler() {} - bool Handle(const std::string& varname, framework::Scope* scope, - framework::Variable* var, framework::Variable** outvar, - const int trainer_id, const std::string& out_var_name = "", - const std::string& table_name = "") override; -}; - -static inline void BuildVar(const std::string& param_name, - std::initializer_list arguments, - paddle::framework::proto::OpDesc::Var* var) { - var->set_parameter(param_name); - for (auto& arg_name : arguments) { - *var->mutable_arguments()->Add() = arg_name; - } -} - -class RequestPrefetchHandler final : public RequestHandler { - public: - explicit RequestPrefetchHandler(int distributed_mode) - : RequestHandler(distributed_mode) {} - virtual ~RequestPrefetchHandler() {} - bool Handle(const std::string& varname, framework::Scope* scope, - framework::Variable* var, framework::Variable** outvar, - const int trainer_id, const std::string& out_var_name = "", - const std::string& table_name = "") override; - - private: - std::unique_ptr PullLargeScaleOp( - const std::string& table_name, const std::string& id_name, - const std::string& out_name) { - framework::OpDesc desc; - 
desc.SetType("lookup_sparse_table_read"); - desc.SetInput("Ids", {id_name}); - desc.SetOutput("Out", std::vector({out_name})); - desc.SetAttr("tablename", {table_name}); - desc.SetAttr("init", true); - desc.SetAttr("value_names", std::vector({"Param"})); - - auto op = paddle::framework::OpRegistry::CreateOp(desc); - return op; - } - - std::unique_ptr BuildLookupTableOp( - const std::string& table_name, const std::string& id_name, - const std::string& out_name) { - paddle::framework::proto::OpDesc op_desc; - op_desc.set_type("lookup_table"); - BuildVar("W", {table_name.data()}, op_desc.add_inputs()); - BuildVar("Ids", {id_name.data()}, op_desc.add_inputs()); - BuildVar("Out", {out_name.data()}, op_desc.add_outputs()); - - auto op = paddle::framework::OpRegistry::CreateOp(op_desc); - return op; - } -}; - -class RequestCheckpointHandler final : public RequestHandler { - public: - explicit RequestCheckpointHandler(int distributed_mode) - : RequestHandler(distributed_mode) {} - - virtual ~RequestCheckpointHandler() {} - bool Handle(const std::string& varname, framework::Scope* scope, - framework::Variable* var, framework::Variable** outvar, - const int trainer_id, const std::string& out_var_name = "", - const std::string& table_name = "") override; - - private: - std::unique_ptr BuildCheckpointOp( - const std::string& varname, const std::string& file_path) { - paddle::framework::proto::OpDesc op_desc; - op_desc.set_type("save"); - BuildVar("X", {varname.data()}, op_desc.add_inputs()); - - auto attr = op_desc.mutable_attrs()->Add(); - attr->set_name("file_path"); - attr->set_type(paddle::framework::proto::AttrType::STRING); - attr->set_s(file_path); - - auto op = paddle::framework::OpRegistry::CreateOp(op_desc); - return op; - } -}; - -class RequestNotifyHandler final : public RequestHandler { - public: - explicit RequestNotifyHandler(int distributed_mode, int trainers) - : RequestHandler(distributed_mode) { - this->trainers = trainers; - for (int i = 0; i < trainers; i++) { - decay_counters[i] = 0; - } - } - virtual ~RequestNotifyHandler() {} - bool Handle(const std::string& varname, framework::Scope* scope, - framework::Variable* var, framework::Variable** outvar, - const int trainer_id, const std::string& out_var_name = "", - const std::string& table_name = "") override; - - private: - int trainers; - std::unordered_map decay_counters; -}; - -class RequestSendAndRecvHandler final : public RequestHandler { - public: - explicit RequestSendAndRecvHandler(int distributed_mode) - : RequestHandler(distributed_mode) {} - virtual ~RequestSendAndRecvHandler() {} - bool Handle(const std::string& varname, framework::Scope* Scope, - framework::Variable* var, framework::Variable** outvar, - const int trainer_id, const std::string& out_var_name = "", - const std::string& table_name = "") override; -}; - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/rpc_client.cc b/paddle/fluid/operators/distributed/rpc_client.cc deleted file mode 100644 index 57ce54870decf..0000000000000 --- a/paddle/fluid/operators/distributed/rpc_client.cc +++ /dev/null @@ -1,32 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/operators/distributed/rpc_client.h" -#include "gflags/gflags.h" - -// default to 3min to avoid temprary network failures. -DEFINE_int32(rpc_deadline, 180000, "deadline timeouts for rpc"); -DEFINE_int32(rpc_retry_times, 3, "retry times for rpc"); - -namespace paddle { -namespace operators { -namespace distributed { - -std::once_flag RPCClient::init_flag_; -std::unique_ptr RPCClient::rpc_client_(nullptr); -int RPCClient::trainer_id_ = 0; - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/rpc_client.h b/paddle/fluid/operators/distributed/rpc_client.h deleted file mode 100644 index 2c756a6f71ff9..0000000000000 --- a/paddle/fluid/operators/distributed/rpc_client.h +++ /dev/null @@ -1,143 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
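Reviewer note: `rpc_client.cc`, deleted above, only holds the static state behind the `RPCClient` singleton (a `std::once_flag`, the client pointer and the trainer id); the header that follows creates the concrete client exactly once per process via `std::call_once`. A standalone sketch of that lazy-initialization pattern, using a stand-in client type rather than the real gRPC client:

```cpp
#include <iostream>
#include <memory>
#include <mutex>

// Stand-in for a concrete client implementation (the real code plugs in a
// gRPC-based subclass through the template parameter).
class FakeClient {
 public:
  explicit FakeClient(int trainer_id) : trainer_id_(trainer_id) {}
  int trainer_id() const { return trainer_id_; }

 private:
  int trainer_id_;
};

// One client per process, created lazily on first use, mirroring the
// call_once dance in RPCClient::GetInstance()/Init().
template <typename T>
T* GetClient(int trainer_id) {
  static std::once_flag flag;
  static std::unique_ptr<T> client;
  std::call_once(flag, [&] { client.reset(new T(trainer_id)); });
  return client.get();
}

int main() {
  FakeClient* a = GetClient<FakeClient>(0);
  FakeClient* b = GetClient<FakeClient>(7);  // already built; the first id wins
  std::cout << (a == b) << " " << a->trainer_id() << "\n";
}
```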
- -#pragma once - -#include // NOLINT -#include -#include - -#include "gflags/gflags.h" -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/operators/distributed/request_handler.h" - -namespace paddle { -namespace framework { -class Scope; -} // namespace framework -namespace platform { -class DeviceContext; -} // namespace platform -} // namespace paddle - -DECLARE_int32(rpc_deadline); -DECLARE_int32(rpc_retry_times); - -namespace paddle { -namespace operators { -namespace distributed { - -class RPCClient { - public: - RPCClient() {} - virtual ~RPCClient() {} - virtual VarHandlePtr AsyncSendVar(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& var_name, - int64_t time_out = FLAGS_rpc_deadline) = 0; - - virtual VarHandlePtr AsyncGetVar(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& var_name, - const std::string& out_varname, - const std::string& table_name = "", - int64_t time_out = FLAGS_rpc_deadline) = 0; - - virtual VarHandlePtr AsyncGetVarNoBarrier( - const std::string& ep, const platform::DeviceContext& ctx, - const framework::Scope& scope, const std::string& var_name, - const std::string& out_varname, - int64_t time_out = FLAGS_rpc_deadline) = 0; - - virtual VarHandlePtr AsyncGetMonomerVariable( - const std::string& ep, const platform::DeviceContext& ctx, - const framework::Scope& scope, const std::string& var_name, - int64_t time_out = FLAGS_rpc_deadline) = 0; - - virtual VarHandlePtr AsyncPrefetchVar( - const std::string& ep, const platform::DeviceContext& ctx, - const framework::Scope& scope, const std::string& in_var_name, - const std::string& out_var_name, const std::string& table_name = "", - int64_t time_out = FLAGS_rpc_deadline) = 0; - - virtual VarHandlePtr AsyncSendBatchBarrier( - const std::string& ep, int64_t time_out = FLAGS_rpc_deadline) = 0; - - virtual VarHandlePtr AsyncSendFetchBarrier( - const std::string& ep, int64_t time_out = FLAGS_rpc_deadline) = 0; - - virtual VarHandlePtr AsyncGetMonomerBarrier( - const std::string& ep, const std::string& var_name, - int64_t time_out = FLAGS_rpc_deadline) = 0; - - virtual VarHandlePtr AsyncCheckpointNotify( - const std::string& ep, const std::string& dirname, - const std::string& varname, const int mode, - int64_t time_out = FLAGS_rpc_deadline) = 0; - - virtual VarHandlePtr AsyncDistributeNotify( - const std::string& ep, const platform::DeviceContext& ctx, - const framework::Scope& scope, const std::string& var_name, - int64_t time_out = FLAGS_rpc_deadline) = 0; - - virtual VarHandlePtr AsyncSendAndRecv( - const std::string& ep, const platform::DeviceContext& ctx, - const framework::Scope& scope, const std::string& send_var_name, - const std::string& recv_var_name, const std::string& table_name = "", - int64_t time_out = FLAGS_rpc_deadline) = 0; - - virtual VarHandlePtr AsyncSendComplete( - const std::string& ep, int64_t time_out = FLAGS_rpc_deadline) = 0; - - // Complete tells all the pserver instances that finishe the training, - // the pserver can reduce it's barrier count, and continue to train - // with other trainers. - virtual void SendComplete() = 0; - - virtual bool Wait() = 0; - - template - static RPCClient* GetInstance(int trainer_id) { - std::call_once(init_flag_, &RPCClient::Init, trainer_id); - return rpc_client_.get(); - } - - // Init is called by GetInstance. 
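For context, roughly how trainer code drove this interface (a sketch only; RPCCLIENT_T is the macro from distributed.h that selects the gRPC or BRPC implementation):

#include <string>

#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/operators/distributed/distributed.h"
#include "paddle/fluid/operators/distributed/rpc_client.h"
#include "paddle/fluid/platform/device_context.h"

// Sketch: send one variable to a pserver endpoint and block until done.
void SendOneVar(const std::string& ep,
                const paddle::platform::DeviceContext& ctx,
                const paddle::framework::Scope& scope) {
  namespace distributed = paddle::operators::distributed;

  // One process-wide client per trainer, created lazily via std::call_once
  // in GetInstance/Init.
  distributed::RPCClient* client =
      distributed::RPCClient::GetInstance<RPCCLIENT_T>(/*trainer_id=*/0);

  // Async* calls return immediately with a VarHandlePtr; Wait() blocks until
  // every outstanding request finishes or FLAGS_rpc_deadline expires.
  client->AsyncSendVar(ep, ctx, scope, "x");
  client->Wait();
}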
- template - static void Init(int trainer_id) { - VLOG(1) << "init rpc client with trainer_id " << trainer_id; - trainer_id_ = trainer_id; - if (rpc_client_.get() == nullptr) { - rpc_client_.reset(new T()); - rpc_client_->InitImpl(); - } - } - - virtual void InitImpl() {} - - protected: - // each trainer have exact one trainer id, it should be static - static int trainer_id_; - - private: - static std::once_flag init_flag_; - static std::unique_ptr rpc_client_; -}; -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/rpc_server.cc b/paddle/fluid/operators/distributed/rpc_server.cc deleted file mode 100644 index 37cf0460fb1fa..0000000000000 --- a/paddle/fluid/operators/distributed/rpc_server.cc +++ /dev/null @@ -1,242 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/operators/distributed/rpc_server.h" - -#include -#include - -namespace paddle { -namespace framework { -class Scope; -} // namespace framework -namespace platform { -class DeviceContext; -} // namespace platform -} // namespace paddle - -namespace paddle { -namespace operators { -namespace distributed { - -class RequestHandler; - -void RPCServer::ShutDown() { - VLOG(3) << "RPCServer ShutDown "; - ShutDownImpl(); - - exit_flag_ = true; - barrier_cond_.notify_all(); - rpc_cond_.notify_all(); -} - -void RPCServer::SavePort() const { - auto file_path = string::Sprintf("/tmp/paddle.%d.port", ::getpid()); - std::ofstream port_file; - port_file.open(file_path); - port_file << selected_port_; - port_file.close(); - VLOG(3) << "selected port written to " << file_path; -} - -void RPCServer::WaitBarrier(const std::string& rpc_name) { - VLOG(3) << "WaitBarrier in: " << rpc_name; - std::unique_lock lock(this->mutex_); - barrier_cond_.wait(lock, [this, &rpc_name] { - return ((barrier_counter_[rpc_name] == client_num_ && client_num_ != 0) || - exit_flag_.load()); - }); - - VLOG(3) << "WaitBarrier out: " << rpc_name - << " counter: " << barrier_counter_[rpc_name]; -} - -void RPCServer::IncreaseBatchBarrier(const std::string rpc_name) { - VLOG(3) << "RPCServer begin IncreaseBatchBarrier " << rpc_name; - // barrier msg should make sure that it's in the right cond(send|recv) - WaitCond(rpc_name); - int b = 0; - std::unique_lock lock(mutex_); - b = ++barrier_counter_[rpc_name]; - VLOG(3) << rpc_name << " barrier_counter: " << b; - if (b >= client_num_) { - lock.unlock(); - VLOG(3) << "BatchBarrier counter reach " << client_num_ << " for " - << rpc_name; - barrier_cond_.notify_all(); - lock.lock(); - } -} - -void RPCServer::Complete() { - { - std::unique_lock lock(mutex_); - client_num_--; - need_reset_all_vars_ = true; - - VLOG(3) << "decrease client_num to: " << client_num_; - if (cur_cond_.load() == rpc_cond_map_[kRequestGet]) { - barrier_counter_[kRequestGet]--; - } - } - barrier_cond_.notify_all(); -} - -bool RPCServer::NeedResetAllVars() { - std::unique_lock 
lock(mutex_); - return need_reset_all_vars_; -} - -int RPCServer::GetClientNum() { - std::unique_lock lock(mutex_); - return client_num_; -} - -void RPCServer::ResetBarrierCounter() { - VLOG(3) << "RPCServer ResetBarrierCounter "; - std::unique_lock lock(mutex_); - for (auto& t : barrier_counter_) { - t.second = 0; - } - need_reset_all_vars_ = false; -} - -void RPCServer::RegisterRPC(const std::string& rpc_name, - RequestHandler* handler, int thread_num) { - rpc_call_map_[rpc_name] = handler; - rpc_thread_num_[rpc_name] = thread_num; - - static int cond = -1; - rpc_cond_map_[rpc_name] = ++cond; - VLOG(3) << "RegisterRPC rpc_name: " << rpc_name << ", handler: " << handler - << ", cond: " << rpc_cond_map_[rpc_name]; -} - -void RPCServer::SetCond(const std::string& rpc_name) { - VLOG(3) << "RPCServer SetCond " << rpc_name; - { - std::unique_lock lock(mutex_); - cur_cond_ = rpc_cond_map_[rpc_name]; - } - - rpc_cond_.notify_all(); -} - -void RPCServer::WaitCond(const std::string& rpc_name) { - VLOG(3) << "RPCServer WaitCond in " << rpc_name; - int cond = 0; - { - std::unique_lock lock(mutex_); - cond = rpc_cond_map_[rpc_name]; - } - - std::unique_lock lock(mutex_); - rpc_cond_.wait( - lock, [=] { return (cur_cond_.load() == cond || exit_flag_.load()); }); - VLOG(3) << "RPCServer WaitCond out " << rpc_name; -} - -void RPCServer::RegisterVar(const std::string& var_name, - const std::string& rpc_name, - framework::Scope* scope, - platform::DeviceContext* dev_ctx) { - MonomerHandle h; - h.var_name_ = var_name; - h.rpc_name_ = rpc_name; - h.scope_ = scope; - h.dev_ctx_ = dev_ctx; - - { - std::unique_lock lock(mutex_); - PADDLE_ENFORCE_EQ( - var_map_.find(var_name), var_map_.end(), - platform::errors::AlreadyExists("%s already in var_map.", var_name)); - var_map_[var_name] = h; - } - - rpc_cond_.notify_all(); - VLOG(3) << "RegisterVar context:" << h.String(); -} - -void RPCServer::IncreaseVarBarrier(const std::string& var_name) { - int b = 0; - MonomerHandle h; - { - std::unique_lock lock(mutex_); - b = ++var_map_[var_name].barrier_; - h = var_map_[var_name]; - } - - if (b >= client_num_) { - barrier_cond_.notify_all(); - } - - VLOG(3) << "IncreaseVarBarrier context:" << h.String(); -} - -void RPCServer::WaitVarBarrier(const std::string& var_name) { - VLOG(3) << "WaitVarBarrier var_name:" << var_name; - - std::unique_lock lock(mutex_); - barrier_cond_.wait(lock, [&]() { - return ((var_map_[var_name].barrier_ >= client_num_ && client_num_ != 0) || - exit_flag_.load()); - }); - - VLOG(3) << "WaitVarBarrier context: " << var_map_[var_name].String(); -} - -void RPCServer::SetVarCond(const std::string& var_name) { - VLOG(3) << "SetVarCond var_name:" << var_name; - { - std::unique_lock lock(mutex_); - if (var_map_.find(var_name) != var_map_.end()) { - rpc_cond_.notify_all(); - } - } -} - -void RPCServer::WaitVarCond(const std::string& var_name) { - VLOG(3) << "WaitVarCond var_name:" << var_name; - - std::unique_lock lock(mutex_); - rpc_cond_.wait(lock, [=] { - return (var_map_.find(var_name) != var_map_.end() || exit_flag_.load()); - }); - - VLOG(3) << "WaitVarCond var_name:" << var_name << " end"; -} - -MonomerHandle RPCServer::GetMonomer(const std::string& var_name) { - MonomerHandle h; - { - std::unique_lock lock(mutex_); - h = var_map_[var_name]; - } - - return h; -} - -void RPCServer::ClearRegisteredVars() { - std::unique_lock lock(mutex_); - var_map_.clear(); -} - -void RPCServer::ClearVar(const std::string& var_name) { - std::unique_lock lock(mutex_); - var_map_.erase(var_name); -} -} // namespace 
distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/rpc_server.h b/paddle/fluid/operators/distributed/rpc_server.h deleted file mode 100644 index 2120260515e25..0000000000000 --- a/paddle/fluid/operators/distributed/rpc_server.h +++ /dev/null @@ -1,149 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include -#include -#include // NOLINT -#include -#include -#include - -#include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/operators/distributed/request_handler.h" -#include "paddle/fluid/platform/device_context.h" - -namespace paddle { -namespace framework { -class Scope; -} // namespace framework -namespace platform { -class DeviceContext; -} // namespace platform -} // namespace paddle - -namespace paddle { -namespace operators { -namespace distributed { - -class RequestHandler; - -struct MonomerHandle { - std::string var_name_; - std::string rpc_name_; - framework::Scope* scope_{nullptr}; - platform::DeviceContext* dev_ctx_{nullptr}; - int64_t barrier_{0}; - - std::string String() { - std::stringstream ss; - ss << "var_name:" << var_name_ << ", rpc_name:" << rpc_name_ - << ", scope:" << scope_ << ", dev_ctx:" << dev_ctx_ - << ", barrier_:" << barrier_; - return ss.str(); - } -}; - -class RPCServer { - public: - explicit RPCServer(const std::string& address, int client_num) - : cur_cond_(0), - bind_address_(address), - exit_flag_(false), - selected_port_(0), - client_num_(client_num), - need_reset_all_vars_(false) {} - - virtual ~RPCServer() {} - virtual void StartServer() = 0; - virtual void WaitServerReady() = 0; - - void ShutDown(); - - bool IsExit() { return exit_flag_.load(); } - - int GetSelectedPort() const { return selected_port_; } - - int GetClientNum(); - - void SavePort() const; - - // RegisterRPC, register the rpc method name to a handler - // class, and auto generate a condition id for this call - // to be used for the barrier. - void RegisterRPC(const std::string& rpc_name, RequestHandler* handler, - int thread_num = 1); - - int GetThreadNum(const std::string& rpc_name) { - return rpc_thread_num_[rpc_name]; - } - - // Wait util all the clients have reached the barrier for one - // rpc method. This function should be called in the - // RequestHandler if you want to run the server/client in a - // synchronous mode. 
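Taken together with the .cc file above, the server side was typically wired up as below (a condensed sketch based on the StartServer helpers in rpc_server_test.cc later in this patch; RPCSERVER_T is the macro from distributed.h selecting the gRPC or BRPC server):

#include <thread>

#include "paddle/fluid/operators/distributed/distributed.h"
#include "paddle/fluid/operators/distributed/request_handler.h"
#include "paddle/fluid/operators/distributed/rpc_server.h"

namespace distributed = paddle::operators::distributed;

// Sketch: bind an ephemeral port, serve one RPC kind, then shut down.
void RunServerOnce(distributed::RequestHandler* handler) {
  RPCSERVER_T server("127.0.0.1:0", /*client_num=*/1);

  // Each registered rpc name gets a handler plus an auto-generated
  // condition id used by SetCond/WaitCond and the batch barrier.
  server.RegisterRPC(distributed::kRequestSend, handler);
  handler->SetRPCServer(&server);

  std::thread serving([&server] { server.StartServer(); });
  server.WaitServerReady();             // returns once the port is bound
  int port = server.GetSelectedPort();  // real port picked for ":0"
  (void)port;

  server.ShutDown();                    // sets exit_flag_ and wakes waiters
  serving.join();
}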
- void WaitBarrier(const std::string& rpc_name); - - void SetCond(const std::string& rpc_name); - void WaitCond(const std::string& rpc_name); - void IncreaseBatchBarrier(const std::string rpc_name); - - void RegisterVar(const std::string& var_name, const std::string& rpc_name, - framework::Scope* scope, platform::DeviceContext* dev_ctx); - void IncreaseVarBarrier(const std::string& var_name); - void WaitVarBarrier(const std::string& var_name); - void SetVarCond(const std::string& var_name); - void WaitVarCond(const std::string& var_name); - void ClearRegisteredVars(); - void ClearVar(const std::string& var_name); - MonomerHandle GetMonomer(const std::string& var_name); - - void Complete(); - - void ResetBarrierCounter(); - - bool NeedResetAllVars(); - - protected: - virtual void ShutDownImpl() = 0; - - private: - std::mutex mutex_; - std::unordered_map barrier_counter_; - std::condition_variable barrier_cond_; - - std::unordered_map rpc_cond_map_; - std::atomic cur_cond_; - std::condition_variable rpc_cond_; - - protected: - std::string bind_address_; - std::atomic exit_flag_; - int selected_port_; - int client_num_; - bool need_reset_all_vars_; - - std::unordered_map rpc_call_map_; - std::unordered_map rpc_thread_num_; - friend class RequestHandler; - - // TODO(gongwb): use more cond to notify or wait; - std::unordered_map var_map_; -}; - -}; // namespace distributed -}; // namespace operators -}; // namespace paddle diff --git a/paddle/fluid/operators/distributed/rpc_server_test.cc b/paddle/fluid/operators/distributed/rpc_server_test.cc deleted file mode 100644 index f59285400033d..0000000000000 --- a/paddle/fluid/operators/distributed/rpc_server_test.cc +++ /dev/null @@ -1,344 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include -#include -#include // NOLINT -#include -#include -#include // NOLINT -#include - -#include "gtest/gtest.h" -#include "paddle/fluid/framework/block_desc.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" - -#include "paddle/fluid/operators/distributed/distributed.h" -#include "paddle/fluid/operators/distributed/heart_beat_monitor.h" -#include "paddle/fluid/operators/distributed/large_scale_kv.h" -#include "paddle/fluid/operators/distributed/request_handler_impl.h" -#include "paddle/fluid/operators/distributed/rpc_client.h" -#include "paddle/fluid/operators/distributed/rpc_server.h" - -namespace framework = paddle::framework; -namespace platform = paddle::platform; -namespace distributed = paddle::operators::distributed; - -USE_NO_KERNEL_OP(lookup_sparse_table_read); -USE_NO_KERNEL_OP(checkpoint_notify); -USE_OP(scale); - -std::unique_ptr g_rpc_service; -std::unique_ptr g_req_handler; - -framework::BlockDesc* AppendSendAndRecvBlock(framework::ProgramDesc* program) { - auto root_block = program->MutableBlock(0); - auto* block = program->AppendBlock(*root_block); - - framework::OpDesc* op = block->AppendOp(); - op->SetType("scale"); - op->SetInput("X", {"x"}); - op->SetOutput("Out", {"res"}); - op->SetAttr("scale", 0.5f); - - auto& out = *root_block->Var("res"); - out.SetType(framework::proto::VarType::LOD_TENSOR); - out.SetShape({1, 10}); - - return block; -} - -void CreateVarsOnScope(framework::Scope* scope, platform::CPUPlace* place) { - auto w_var = scope->Var("w"); - w_var->GetMutable(); - - auto out_var = scope->Var("out"); - out_var->GetMutable(); - - auto ids_var = scope->Var("ids"); - ids_var->GetMutable(); - - auto x_var = scope->Var("x"); - x_var->GetMutable(); - - auto res_var = scope->Var("res"); - res_var->GetMutable(); -} - -void InitTensorsOnClient(framework::Scope* scope, platform::CPUPlace* place, - int64_t rows_numel) { - CreateVarsOnScope(scope, place); - auto ids_var = scope->Var("ids")->GetMutable(); - int64_t* ids_ptr = - ids_var->mutable_data(framework::DDim({rows_numel, 1}), *place); - for (int64_t i = 0; i < rows_numel; ++i) ids_ptr[i] = i * 2; - - auto x_var = scope->Var("x")->GetMutable(); - float* x_ptr = - x_var->mutable_data(framework::DDim({1, rows_numel}), *place); - for (int64_t i = 0; i < rows_numel; ++i) x_ptr[i] = 1.0; -} - -void InitTensorsOnServer(framework::Scope* scope, platform::CPUPlace* place, - int64_t rows_numel) { - CreateVarsOnScope(scope, place); - auto w = scope->Var("w")->GetMutable(); - auto w_value = w->mutable_value(); - w_value->Resize({rows_numel, 10}); - for (int64_t i = 0; i < rows_numel; ++i) w->AutoGrownIndex(i, true); - - auto ptr = w_value->mutable_data(*place); - - for (int64_t i = 0; i < w_value->numel(); ++i) { - ptr[i] = static_cast(i / 10); - } -} - -void StartServer(const std::string& rpc_name) { - framework::ProgramDesc program; - framework::Scope scope; - platform::CPUPlace place; - framework::Executor exe(place); - platform::CPUDeviceContext ctx(place); - - std::unordered_map> - prefetch_var_name_to_prepared; - - g_req_handler->SetProgram(&program); - g_req_handler->SetPrefetchPreparedCtx(&prefetch_var_name_to_prepared); - g_req_handler->SetDevCtx(&ctx); - g_req_handler->SetScope(&scope); - g_req_handler->SetExecutor(&exe); - - g_rpc_service->RegisterRPC(rpc_name, g_req_handler.get()); - - // distributed::HeartBeatMonitor::Init(1, true, "w@grad"); - - g_req_handler->SetRPCServer(g_rpc_service.get()); - - std::thread server_thread( - 
std::bind(&distributed::RPCServer::StartServer, g_rpc_service.get())); - - server_thread.join(); -} - -void StartSendAndRecvServer(const std::string& rpc_name) { - framework::ProgramDesc program; - framework::Scope scope; - platform::CPUPlace place; - framework::Executor exe(place); - platform::CPUDeviceContext ctx(place); - auto block = AppendSendAndRecvBlock(&program); - std::string in_var_name("x"); - std::vector prefetch_block_ids{block->ID()}; - auto prepared = exe.Prepare(program, prefetch_block_ids); - InitTensorsOnServer(&scope, &place, 10); - - std::unordered_map> - grad_to_prepared_ctx; - grad_to_prepared_ctx[in_var_name] = prepared[0]; - - g_req_handler->SetProgram(&program); - g_req_handler->SetGradToPreparedCtx(&grad_to_prepared_ctx); - g_req_handler->SetDevCtx(&ctx); - g_req_handler->SetScope(&scope); - g_req_handler->SetExecutor(&exe); - - g_rpc_service->RegisterRPC(rpc_name, g_req_handler.get()); - g_req_handler->SetRPCServer(g_rpc_service.get()); - - std::thread server_thread( - std::bind(&distributed::RPCServer::StartServer, g_rpc_service.get())); - - server_thread.join(); -} - -TEST(COMPLETE, CPU) { - setenv("http_proxy", "", 1); - setenv("https_proxy", "", 1); - g_req_handler.reset( - new distributed::RequestSendHandler(distributed::DistributedMode::kSync)); - g_rpc_service.reset(new RPCSERVER_T("127.0.0.1:0", 2)); - distributed::RPCClient* client = - distributed::RPCClient::GetInstance(0); - PADDLE_ENFORCE_NE(client, nullptr, - platform::errors::InvalidArgument( - "Client Start Fail, Check Your Code & Env")); - std::thread server_thread(StartServer, distributed::kRequestSend); - g_rpc_service->WaitServerReady(); - int port = g_rpc_service->GetSelectedPort(); - std::string ep = paddle::string::Sprintf("127.0.0.1:%d", port); - client->AsyncSendComplete(ep); - client->Wait(); - - EXPECT_EQ(g_rpc_service->GetClientNum(), 1); - - g_rpc_service->ShutDown(); - server_thread.join(); - g_rpc_service.reset(nullptr); - g_req_handler.reset(nullptr); -} - -TEST(SENDANDRECV, CPU) { - setenv("http_proxy", "", 1); - setenv("https_proxy", "", 1); - g_req_handler.reset(new distributed::RequestSendAndRecvHandler( - distributed::DistributedMode::kAsync)); - g_rpc_service.reset(new RPCSERVER_T("127.0.0.1:0", 1)); - distributed::RPCClient* client = - distributed::RPCClient::GetInstance(0); - PADDLE_ENFORCE_NE(client, nullptr, - platform::errors::InvalidArgument( - "Client Start Fail, Check Your Code & Env")); - std::thread server_thread(StartSendAndRecvServer, - distributed::kRequestSendAndRecv); - g_rpc_service->WaitServerReady(); - int port = g_rpc_service->GetSelectedPort(); - std::string ep = paddle::string::Sprintf("127.0.0.1:%d", port); - - framework::Scope scope; - platform::CPUPlace place; - platform::CPUDeviceContext ctx(place); - - // create var on local scope - int64_t rows_numel = 10; - InitTensorsOnClient(&scope, &place, rows_numel); - std::string in_var_name("x"); - std::string out_var_name("res"); - - client->AsyncSendAndRecv(ep, ctx, scope, in_var_name, out_var_name); - client->Wait(); - auto var = scope.Var(out_var_name); - auto value = var->GetMutable(); - auto ptr = value->mutable_data(place); - - for (int64_t i = 0; i < rows_numel; ++i) { - EXPECT_EQ(ptr[i], 0.5); - } - g_rpc_service->ShutDown(); - server_thread.join(); - LOG(INFO) << "begin reset"; - g_rpc_service.reset(nullptr); - g_req_handler.reset(nullptr); -} - -void StartCheckpointServer(const std::string& rpc_name) { - framework::ProgramDesc program; - framework::Scope scope; - platform::CPUPlace place; - 
framework::Executor exe(place); - platform::CPUDeviceContext ctx(place); - - std::vector metas; - - auto meta = distributed::SparseMeta(); - meta.name = "embedding.block0"; - meta.value_names = {"Param"}; - meta.value_dims = {64}; - meta.mode = distributed::Mode::training; - meta.grad_name = "embedding@Grad"; - meta.cached_varnames = {"kSparseIds"}; - meta.initializer_attrs = {"fill_constant&1.0"}; - meta.entry = "none"; - - metas.push_back(meta); - distributed::LargeScaleKV::Init(metas); - - auto* ins = distributed::LargeScaleKV::GetInstance(); - ins->Get("embedding.block0")->Init({0, 1, 2, 3, 4, 5, 6, 7, 8, 9}); - - std::unordered_map> - prefetch_var_name_to_prepared; - - g_req_handler->SetProgram(&program); - g_req_handler->SetPrefetchPreparedCtx(&prefetch_var_name_to_prepared); - g_req_handler->SetDevCtx(&ctx); - g_req_handler->SetScope(&scope); - g_req_handler->SetExecutor(&exe); - - g_rpc_service->RegisterRPC(rpc_name, g_req_handler.get()); - - g_req_handler->SetRPCServer(g_rpc_service.get()); - - std::thread server_thread( - std::bind(&distributed::RPCServer::StartServer, g_rpc_service.get())); - - server_thread.join(); -} - -TEST(LARGE_SCALE_CHECKPOINT, CPU) { - setenv("http_proxy", "", 1); - setenv("https_proxy", "", 1); - - paddle::framework::Scope scope; - paddle::platform::CPUPlace place; - - g_req_handler.reset(new distributed::RequestCheckpointHandler( - distributed::DistributedMode::kAsync)); - g_rpc_service.reset(new RPCSERVER_T("127.0.0.1:0", 1)); - - distributed::RPCClient* client = - distributed::RPCClient::GetInstance(0); - - PADDLE_ENFORCE_NE(client, nullptr, - platform::errors::InvalidArgument( - "Client Start Fail, Check Your Code & Env")); - - std::thread server_thread(StartCheckpointServer, - distributed::kRequestCheckpoint); - g_rpc_service->WaitServerReady(); - - int port = g_rpc_service->GetSelectedPort(); - std::string ep = paddle::string::Sprintf("127.0.0.1:%d", port); - - auto save_path = - paddle::string::Sprintf("%s/%s/%s", "/tmp/large_scale_table/base", - "embedding", "embedding.block0"); - int mode = 0; - client->AsyncCheckpointNotify(ep, save_path, "embedding.block0", mode); - client->Wait(); - - save_path = - paddle::string::Sprintf("%s/%s/%s", "/tmp/large_scale_table/delta", - "embedding", "embedding.block0"); - mode = 1; - client->AsyncCheckpointNotify(ep, save_path, "embedding.block0", mode); - client->Wait(); - - paddle::framework::AttributeMap attrs; - - std::vector eps = {ep}; - attrs["endpoints"] = eps; - attrs["dirname"] = std::string("/tmp/large_scale_table/delta1"); - attrs["varname"] = std::string("embedding"); - attrs["mode"] = 2; - std::vector slices = {"embedding.block0"}; - attrs["slice_varnames"] = slices; - std::vector remotes = {"embedding.block0"}; - attrs["remote_varnames"] = remotes; - - auto ops = - framework::OpRegistry::CreateOp("checkpoint_notify", {}, {}, attrs, true); - ops->Run(scope, place); - - g_rpc_service->ShutDown(); - server_thread.join(); - LOG(INFO) << "begin reset"; - g_rpc_service.reset(nullptr); - g_req_handler.reset(nullptr); -} diff --git a/paddle/fluid/operators/distributed/send_recv.proto.in b/paddle/fluid/operators/distributed/send_recv.proto.in deleted file mode 100644 index a333642bd16fb..0000000000000 --- a/paddle/fluid/operators/distributed/send_recv.proto.in +++ /dev/null @@ -1,88 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. Licensed under -the Apache License, Version 2.0 (the "License"); you may not use this file -except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -syntax = "proto3"; -package sendrecv; - -option cc_generic_services = @cc_generic_services@; - -service SendRecvService { - // For parameter server round-robin like hashing, do not split tensors. - // Send and recv only one tensor - // TODO(typhoonzero): add streaming API - rpc SendVariable(VariableMessage) returns (VoidMessage) {} - // Argument VariableMessage for GetVariable should only contain varname. - rpc GetVariable(VariableMessage) returns (VariableMessage) {} - rpc GetVariableNoBarrier(VariableMessage) returns (VariableMessage) {} - // pre-fetch variable by given variable name and Ids - rpc PrefetchVariable(VariableMessage) returns (VariableMessage) {} - - rpc CheckpointNotify(VariableMessage) returns (VoidMessage) {} - rpc DistributeNotify(VariableMessage) returns (VoidMessage) {} - rpc SendAndRecvVariable(VariableMessage) returns (VariableMessage) {} - rpc GetMonomerVariable(VariableMessage) returns (VariableMessage) {} - rpc GetMonomerBarrier(VariableMessage) returns (VoidMessage) {} -} - -// It can be: LoDTensor、SelectedRows or NCCL_ID -enum VarType { - LOD_TENSOR = 0; - SELECTED_ROWS = 1; - NCCL_ID = 2; -} - -// VariableMessage is serialized paddle variable message. -// NOTICE(gongwb):don't modify this proto if you are not -// not familar with how we serialize in sendrecvop_utils.h -// and deserilize it in variable_response.h. -message VariableMessage { - enum Type { - // Pod Types - BOOL = 0; - INT16 = 1; - INT32 = 2; - INT64 = 3; - FP16 = 4; - FP32 = 5; - FP64 = 6; - } - - message LodData { repeated int64 lod_data = 1; } - string varname = 1; - // TODO(Yancey1989): reference framework::proto::VarDesc::VarType - VarType type = 2; - // bool persistable is not needed for sending. - // tensor info: - Type data_type = 3; - repeated int64 dims = 4; - - // lod details: - int64 lod_level = 5; - repeated LodData lod = 6; - // selected_rows height, aka. original dim0 - int64 slr_height = 7; - // tensor data - bytes serialized = 8; - // selected_rows data - bytes rows = 9; - // Look up table block execution output variable name. - string out_varname = 10; - // If 1, the ps server will start profiling, the ps - // server stops profiling and generates a profile to /tmp/profile_ps_* - // when profile switches from 1 to 2. - int64 profile = 11; - int64 trainer_id = 12; - string table_name = 13; -} - -message VoidMessage {} diff --git a/paddle/fluid/operators/distributed/sendrecvop_utils.cc b/paddle/fluid/operators/distributed/sendrecvop_utils.cc deleted file mode 100644 index 107c74eb2670e..0000000000000 --- a/paddle/fluid/operators/distributed/sendrecvop_utils.cc +++ /dev/null @@ -1,117 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
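As a rough illustration, a LoDTensor was described on the wire by filling the message above along these lines (field names come from the proto; the helper itself is hypothetical, the real serialization lived in sendrecvop_utils.cc and variable_response.h):

#include <cstdint>
#include <string>
#include <vector>

#include "paddle/fluid/operators/distributed/distributed_pb.h"

// Hypothetical helper: describe an FP32 LoDTensor without LoD information.
sendrecv::VariableMessage DescribeTensor(const std::string& varname,
                                         const std::vector<int64_t>& dims) {
  sendrecv::VariableMessage msg;
  msg.set_varname(varname);
  msg.set_type(sendrecv::LOD_TENSOR);  // LOD_TENSOR / SELECTED_ROWS / NCCL_ID
  msg.set_data_type(sendrecv::VariableMessage::FP32);
  for (int64_t d : dims) {
    msg.add_dims(d);
  }
  msg.set_lod_level(0);  // no LoD attached
  // The raw tensor bytes travel in the `serialized` field; GetTensorPayload
  // below prepares that buffer (pinning GPU memory first when needed).
  return msg;
}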
-See the License for the specific language governing permissions and -limitations under the License. */ -#include - -#include "paddle/fluid/operators/distributed/sendrecvop_utils.h" - -namespace paddle { -namespace framework { -class Variable; -} // namespace framework -namespace memory { -namespace allocation { -class Allocation; -} // namespace allocation -} // namespace memory -} // namespace paddle - -DEFINE_bool(rpc_disable_reuse_port, false, "Disable SO_REUSEPORT or not."); -DEFINE_int32(rpc_retry_bind_port, 3, - "Retry to bind the address if address is already used."); - -namespace paddle { -namespace operators { -namespace distributed { - -using VarMsg = sendrecv::VariableMessage; - -static TensorPayload GetCommunicationAllocationFromTensor( - const platform::DeviceContext& ctx, const framework::Tensor& tensor) { - if (is_gpu_place(ctx.GetPlace())) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - PADDLE_ENFORCE_EQ( - is_gpu_place(tensor.place()), true, - platform::errors::PreconditionNotMet("Please run in gpu place.")); - auto& gpu_dev_ctx = - reinterpret_cast(ctx); - auto copy_size = tensor.numel() * framework::SizeOfType(tensor.type()); - platform::CUDAPinnedPlace cuda_pinned; - auto result = memory::AllocShared(cuda_pinned, copy_size); - - memory::Copy(cuda_pinned, result->ptr(), - BOOST_GET_CONST(platform::CUDAPlace, tensor.place()), - tensor.data(), copy_size, gpu_dev_ctx.stream()); - ctx.Wait(); - return TensorPayload(result); -#else - PADDLE_THROW( - platform::errors::Unavailable("This situation should not be happened")); -#endif - } else { - return TensorPayload(tensor); - } -} -TensorPayload GetTensorPayload(framework::Variable* var, - const platform::DeviceContext& ctx, - VarMsg* request) { - auto tensor = var->Get(); - // FIXME(wuyi): data types in send_recv.proto is copied from - // framework.proto - request->set_data_type(static_cast(tensor.type())); - for (auto& dim : framework::vectorize(tensor.dims())) { - request->add_dims(dim); - } - const framework::LoD lod = tensor.lod(); - if (lod.size() > 0) { - request->set_lod_level(lod.size()); - for (auto& each : lod) { - VarMsg::LodData* lod_inner = request->add_lod(); - for (auto& d : each) { - lod_inner->add_lod_data(d); - } - } - } - return GetCommunicationAllocationFromTensor(ctx, tensor); -} - -TensorPayload GetSelectedRowsPayload(framework::Variable* var, - const platform::DeviceContext& ctx, - VarMsg* request) { - auto* slr = var->GetMutable(); - request->set_data_type(static_cast(slr->value().type())); - request->set_lod_level(0); - request->set_slr_height(slr->height()); - - for (auto& dim : framework::vectorize(slr->value().dims())) { - request->add_dims(dim); - } - - auto* tensor = slr->mutable_value(); - return GetCommunicationAllocationFromTensor(ctx, *tensor); -} - -TensorPayload::TensorPayload(std::shared_ptr allocation) - : allocation_(allocation), offset_(0), memory_size_(allocation->size()) {} -TensorPayload::TensorPayload(const framework::Tensor& tensor) - : allocation_(tensor.Holder()), - offset_(tensor.offset()), - memory_size_(tensor.numel() * framework::SizeOfType(tensor.type())) {} -void* TensorPayload::ptr() const { - return reinterpret_cast( - reinterpret_cast(allocation_->ptr()) + offset_); -} -size_t TensorPayload::memory_size() const { return memory_size_; } -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/sendrecvop_utils.h b/paddle/fluid/operators/distributed/sendrecvop_utils.h deleted file mode 100644 
index 84ed1ab024712..0000000000000 --- a/paddle/fluid/operators/distributed/sendrecvop_utils.h +++ /dev/null @@ -1,110 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include -#include -#include -#include -#include - -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/framework/selected_rows.h" -#include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/framework/var_type.h" -#include "paddle/fluid/operators/distributed/distributed_pb.h" -#include "paddle/fluid/platform/port.h" - -namespace paddle { -namespace framework { -class Tensor; -class Variable; -} // namespace framework -namespace memory { -namespace allocation { -class Allocation; -} // namespace allocation -} // namespace memory -namespace platform { -class DeviceContext; -} // namespace platform -} // namespace paddle - -namespace paddle { -namespace operators { -namespace distributed { - -using VarMsg = sendrecv::VariableMessage; - -class TensorPayload final { - public: - explicit TensorPayload(const framework::Tensor& tensor); - explicit TensorPayload(std::shared_ptr allocation); - - TensorPayload(const TensorPayload& o) = default; - TensorPayload& operator=(const TensorPayload& o) = default; - - void* ptr() const; - size_t memory_size() const; - - private: - std::shared_ptr allocation_; - size_t offset_; - size_t memory_size_; -}; - -inline void SerializeDestroyCallback(void* payload) { - if (payload != nullptr) { - auto* shared_payload = reinterpret_cast(payload); - delete shared_payload; - } -} - -TensorPayload GetTensorPayload(framework::Variable* var, - const platform::DeviceContext& ctx, - VarMsg* request); - -TensorPayload GetSelectedRowsPayload(framework::Variable* var, - const platform::DeviceContext& ctx, - VarMsg* request); - -inline framework::proto::VarType::Type ToVarType( - sendrecv::VariableMessage::Type type) { - switch (type) { - case sendrecv::VariableMessage::FP32: - return framework::proto::VarType::FP32; // NOLINT - case sendrecv::VariableMessage::FP64: - return framework::proto::VarType::FP64; // NOLINT - case sendrecv::VariableMessage::INT32: - return framework::proto::VarType::INT32; // NOLINT - case sendrecv::VariableMessage::INT64: - return framework::proto::VarType::INT64; // NOLINT - case sendrecv::VariableMessage::BOOL: - return framework::proto::VarType::BOOL; // NOLINT - default: - PADDLE_THROW( - platform::errors::InvalidArgument("Not support type id: %d.", type)); - } -} - -template