diff --git a/.gitignore b/.gitignore
index 749832c3930cf..c246a56cf15a4 100644
--- a/.gitignore
+++ b/.gitignore
@@ -32,3 +32,6 @@ build_*
 cmake-build-*
 paddle/fluid/operators/distributed/send_recv.proto
 model_test
+
+Testing
+tools/__pycache__
diff --git a/README.md b/README.md
index 6b3f3ef86fe1b..fce850a3f6320 100644
--- a/README.md
+++ b/README.md
@@ -85,7 +85,7 @@ We provide [English](https://www.paddlepaddle.org.cn/documentation/docs/en/guide
 ## Communication
 
 - [Github Issues](https://github.com/PaddlePaddle/Paddle/issues): bug reports, feature requests, install issues, usage issues, etc.
-- QQ discussion group: 793866180 (PaddlePaddle).
+- QQ discussion group: 441226485 (PaddlePaddle).
 - [Forums](https://ai.baidu.com/forum/topic/list/168?pageNo=1): discuss implementations, research, etc.
     
 ## Courses
diff --git a/README_cn.md b/README_cn.md
index cc8afde7dd266..7d1dd7e8eb611 100644
--- a/README_cn.md
+++ b/README_cn.md
@@ -82,7 +82,7 @@ PaddlePaddle用户可领取**免费Tesla V100在线算力资源**，训练模型
 ## 交流与反馈
 
 - 欢迎您通过[Github Issues](https://github.com/PaddlePaddle/Paddle/issues)来提交问题、报告与建议
-- QQ群: 793866180 (PaddlePaddle)
+- QQ群: 441226485 (PaddlePaddle)
 - [论坛](https://ai.baidu.com/forum/topic/list/168): 欢迎大家在PaddlePaddle论坛分享在使用PaddlePaddle中遇到的问题和经验, 营造良好的论坛氛围
     
 ## 课程
diff --git a/cmake/external/boost.cmake b/cmake/external/boost.cmake
index d88d693d8286d..0c1ec19a2c293 100644
--- a/cmake/external/boost.cmake
+++ b/cmake/external/boost.cmake
@@ -23,7 +23,10 @@ set(BOOST_PROJECT       "extern_boost")
 # checked that the devtools package of CentOS 6 installs boost 1.41.0.
 # So we use 1.41.0 here.
 set(BOOST_VER   "1.41.0")
-set(BOOST_TAR   "boost_1_41_0" CACHE STRING "" FORCE)
+# boost_1_41_0_2021_10.tar.gz is almost the same with boost_1_41_0.tar.gz,
+# except in visualc.hpp i comment a warning of "unknown compiler version",
+# so if you need to change boost, you may need to block the warning similarly.
+set(BOOST_TAR   "boost_1_41_0_2021_10" CACHE STRING "" FORCE)
 set(BOOST_URL   "http://paddlepaddledeps.bj.bcebos.com/${BOOST_TAR}.tar.gz" CACHE STRING "" FORCE)
 
 MESSAGE(STATUS "BOOST_VERSION: ${BOOST_VER}, BOOST_URL: ${BOOST_URL}")
@@ -46,7 +49,7 @@ ExternalProject_Add(
     ${BOOST_PROJECT}
     ${EXTERNAL_PROJECT_LOG_ARGS}
     "${BOOST_DOWNLOAD_CMD}"
-    URL_MD5               f891e8c2c9424f0565f0129ad9ab4aff
+    URL_MD5               51be7cc203628dc0848e97eee32d79e3
     PREFIX                ${BOOST_PREFIX_DIR}
     DOWNLOAD_DIR          ${BOOST_SOURCE_DIR}
     SOURCE_DIR            ${BOOST_SOURCE_DIR}
diff --git a/cmake/external/cinn.cmake b/cmake/external/cinn.cmake
index ee5aea9f8b294..3e9114a35d0f0 100644
--- a/cmake/external/cinn.cmake
+++ b/cmake/external/cinn.cmake
@@ -27,7 +27,7 @@ add_definitions(-w)
 include(ExternalProject)
 set(CINN_SOURCE_DIR ${THIRD_PARTY_PATH}/CINN)
 # TODO(zhhsplendid): Modify git tag after we have release tag
-set(CINN_GIT_TAG e422c01b7875301996a2baf67a14ba61b0e6192a)
+set(CINN_GIT_TAG develop)
 set(CINN_OPTIONAL_ARGS -DPY_VERSION=${PY_VERSION} -DWITH_CUDA=${WITH_GPU} -DWITH_CUDNN=${WITH_GPU} -DPUBLISH_LIBS=ON -DWITH_TESTING=ON)
 set(CINN_BUILD_COMMAND $(MAKE) cinnapi -j)
 ExternalProject_Add(
diff --git a/cmake/external/dirent.cmake b/cmake/external/dirent.cmake
new file mode 100644
index 0000000000000..45c9c7c2d68a0
--- /dev/null
+++ b/cmake/external/dirent.cmake
@@ -0,0 +1,45 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Note(chenxin33): dirent.h is only exist in Linux, so get it from github when build in windows.
+# use dirent tag v1.23.2 on 09/05//2018 https://github.com/tronkko/dirent.git
+
+INCLUDE (ExternalProject)
+
+SET(DIRENT_PREFIX_DIR       ${THIRD_PARTY_PATH}/dirent)
+SET(DIRENT_SOURCE_DIR       ${THIRD_PARTY_PATH}/dirent/src/extern_dirent)
+SET(DIRENT_INCLUDE_DIR      ${DIRENT_SOURCE_DIR}/include)
+
+include_directories(${DIRENT_INCLUDE_DIR})
+
+set(DIRENT_REPOSITORY  ${GIT_URL}/tronkko/dirent)
+set(DIRENT_TAG         1.23.2)
+
+ExternalProject_Add(
+  extern_dirent
+  ${EXTERNAL_PROJECT_LOG_ARGS}
+  ${SHALLOW_CLONE}
+  GIT_REPOSITORY  ${DIRENT_REPOSITORY}
+  GIT_TAG         ${DIRENT_TAG}
+  PREFIX          ${DIRENT_PREFIX_DIR}
+  UPDATE_COMMAND    ""
+  CONFIGURE_COMMAND ""
+  BUILD_COMMAND     ""
+  INSTALL_COMMAND   ""
+  TEST_COMMAND      ""
+)
+
+add_library(dirent INTERFACE)
+
+add_dependencies(dirent extern_dirent)
\ No newline at end of file
diff --git a/cmake/external/xpu.cmake b/cmake/external/xpu.cmake
index 11a7adbbeb9a8..c87d11ad99426 100644
--- a/cmake/external/xpu.cmake
+++ b/cmake/external/xpu.cmake
@@ -35,7 +35,7 @@ ELSE ()
 ENDIF()
 
 SET(XPU_BASE_URL_WITHOUT_DATE "https://baidu-kunlun-product.cdn.bcebos.com/KL-SDK/klsdk-dev")
-SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20211020")
+SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20211107")
 SET(XPU_XRE_URL  "${XPU_BASE_URL}/${XPU_XRE_DIR_NAME}.tar.gz" CACHE STRING "" FORCE)
 SET(XPU_XDNN_URL "${XPU_BASE_URL}/${XPU_XDNN_DIR_NAME}.tar.gz" CACHE STRING "" FORCE)
 SET(XPU_XCCL_URL "${XPU_BASE_URL_WITHOUT_DATE}/20210623/${XPU_XCCL_DIR_NAME}.tar.gz" CACHE STRING "" FORCE)
diff --git a/cmake/generic.cmake b/cmake/generic.cmake
index 197d12e7ad872..2004abcbfa1f2 100644
--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -116,6 +116,20 @@ function(find_fluid_modules TARGET_NAME)
   endif()
 endfunction(find_fluid_modules)
 
+set_property(GLOBAL PROPERTY PTEN_MODULES "")
+# find all pten modules is used for paddle static library
+# for building inference libs
+function(find_pten_modules TARGET_NAME)
+  get_filename_component(__target_path ${TARGET_NAME} ABSOLUTE)
+  string(REGEX REPLACE "^${PADDLE_SOURCE_DIR}/" "" __target_path ${__target_path})
+  string(FIND "${__target_path}" "pten" pos)
+  if(pos GREATER 1)
+    get_property(pten_modules GLOBAL PROPERTY PTEN_MODULES)
+    set(pten_modules ${pten_modules} ${TARGET_NAME})
+    set_property(GLOBAL PROPERTY PTEN_MODULES "${pten_modules}")
+  endif()
+endfunction(find_pten_modules)
+
 function(common_link TARGET_NAME)
   if (WITH_PROFILER)
     target_link_libraries(${TARGET_NAME} gperftools::profiler)
@@ -310,6 +324,7 @@ function(cc_library TARGET_NAME)
       else()
         add_library(${TARGET_NAME} STATIC ${cc_library_SRCS})
         find_fluid_modules(${TARGET_NAME})
+        find_pten_modules(${TARGET_NAME})
       endif()
     if(cc_library_DEPS)
       # Don't need link libwarpctc.so
@@ -482,6 +497,7 @@ function(nv_library TARGET_NAME)
       else()
         add_library(${TARGET_NAME} STATIC ${nv_library_SRCS})
         find_fluid_modules(${TARGET_NAME})
+        find_pten_modules(${TARGET_NAME})
       endif()
       if (nv_library_DEPS)
         add_dependencies(${TARGET_NAME} ${nv_library_DEPS})
@@ -572,6 +588,7 @@ function(hip_library TARGET_NAME)
       else()
         hip_add_library(${TARGET_NAME} STATIC ${hip_library_SRCS})
         find_fluid_modules(${TARGET_NAME})
+        find_pten_modules(${TARGET_NAME})
       endif()
       if (hip_library_DEPS)
         add_dependencies(${TARGET_NAME} ${hip_library_DEPS})
diff --git a/cmake/operators.cmake b/cmake/operators.cmake
index 7830cf7b50acc..a537719cc7582 100644
--- a/cmake/operators.cmake
+++ b/cmake/operators.cmake
@@ -185,6 +185,7 @@ function(op_library TARGET)
         list(REMOVE_ITEM hip_srcs "cholesky_op.cu")
         list(REMOVE_ITEM hip_srcs "matrix_rank_op.cu")
         list(REMOVE_ITEM hip_srcs "svd_op.cu")
+        list(REMOVE_ITEM hip_srcs "eigvalsh_op.cu")
         list(REMOVE_ITEM hip_srcs "qr_op.cu")
         list(REMOVE_ITEM hip_srcs "eigh_op.cu")
         list(REMOVE_ITEM hip_srcs "multinomial_op.cu")
diff --git a/cmake/third_party.cmake b/cmake/third_party.cmake
index 7cdbee1746a8f..8b3c17ae3dbb2 100644
--- a/cmake/third_party.cmake
+++ b/cmake/third_party.cmake
@@ -331,7 +331,7 @@ if (WITH_PSCORE)
 
     include(external/libmct)     # download, build, install libmct
     list(APPEND third_party_deps extern_libmct)
-    
+
     if (WITH_HETERPS)
         include(external/rocksdb)     # download, build, install libmct
         list(APPEND third_party_deps extern_rocksdb)
@@ -378,4 +378,9 @@ if (WITH_POCKETFFT)
     add_definitions(-DPADDLE_WITH_POCKETFFT)
 endif (WITH_POCKETFFT)
 
+if (WIN32)
+    include(external/dirent)
+    list(APPEND third_party_deps extern_dirent)
+endif (WIN32)
+
 add_custom_target(third_party ALL DEPENDS ${third_party_deps})
diff --git a/paddle/CMakeLists.txt b/paddle/CMakeLists.txt
index c0c04d475959d..b3a1b2e8c9587 100644
--- a/paddle/CMakeLists.txt
+++ b/paddle/CMakeLists.txt
@@ -1,4 +1,5 @@
 add_subdirectory(scripts)
 add_subdirectory(testing)
 set(PYTHON_TESTS_DIR ${PADDLE_BINARY_DIR}/python/paddle/fluid/tests CACHE INTERNAL "python tests directory")
+add_subdirectory(pten)
 add_subdirectory(fluid)
diff --git a/paddle/fluid/distributed/CMakeLists.txt b/paddle/fluid/distributed/CMakeLists.txt
index 905347d031b35..24923d7268186 100644
--- a/paddle/fluid/distributed/CMakeLists.txt
+++ b/paddle/fluid/distributed/CMakeLists.txt
@@ -1,4 +1,5 @@
 if(NOT WITH_PSCORE)
+    add_subdirectory(fleet_executor)
     return()
 endif()
 
@@ -11,10 +12,12 @@ if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 7.0)
             "${DISTRIBUTE_COMPILE_FLAGS} -faligned-new")
 endif()
 
+add_subdirectory(common)
 add_subdirectory(service)
 add_subdirectory(table)
 add_subdirectory(test)
 add_subdirectory(index_dataset)
+add_subdirectory(fleet_executor)
 
 get_property(RPC_DEPS GLOBAL PROPERTY RPC_DEPS)
 
diff --git a/paddle/fluid/distributed/common/CMakeLists.txt b/paddle/fluid/distributed/common/CMakeLists.txt
new file mode 100644
index 0000000000000..eab6165ca689e
--- /dev/null
+++ b/paddle/fluid/distributed/common/CMakeLists.txt
@@ -0,0 +1,4 @@
+
+cc_library(afs_wrapper SRCS afs_warpper.cc DEPS fs ps_framework_proto)
+
+#set_property(GLOBAL PROPERTY COMMON_DEPS afs_warpper)
diff --git a/paddle/fluid/distributed/common/afs_warpper.cc b/paddle/fluid/distributed/common/afs_warpper.cc
new file mode 100644
index 0000000000000..d539ec6080469
--- /dev/null
+++ b/paddle/fluid/distributed/common/afs_warpper.cc
@@ -0,0 +1,89 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/distributed/common/afs_warpper.h"
+#include "paddle/fluid/framework/io/fs.h"
+
+namespace paddle {
+namespace distributed {
+// AfsClient impl
+int AfsClient::initialize(const FsClientParameter& fs_client_param) {
+  // temporarily implemented with hdfs-client
+  return initialize(fs_client_param.hadoop_bin(), fs_client_param.uri(),
+                    fs_client_param.user(), fs_client_param.passwd(),
+                    fs_client_param.buffer_size());
+}
+int AfsClient::initialize(const std::string& hadoop_bin, const std::string& uri,
+                          const std::string& user, const std::string& passwd,
+                          int buffer_size_param) {
+  return initialize(hadoop_bin, uri, paddle::string::format_string(
+                                         "%s,%s", user.c_str(), passwd.c_str()),
+                    buffer_size_param);
+}
+int AfsClient::initialize(const std::string& hadoop_bin, const std::string& uri,
+                          const std::string& ugi, int buffer_size_param) {
+  // temporarily implemented with hdfs-client
+  size_t buffer_size = 1L << 25;  // 32MB
+  if (buffer_size_param > static_cast<int>(buffer_size)) {
+    buffer_size = buffer_size_param;
+  }
+  paddle::framework::hdfs_set_buffer_size(buffer_size);
+  paddle::framework::hdfs_set_command(paddle::string::format_string(
+      "2>>./hdfs_err.log %s fs -Dfs.default.name=%s -Dhadoop.job.ugi=%s "
+      "-Ddfs.client.block.write.retries=15 -Ddfs.rpc.timeout=300000",
+      hadoop_bin.c_str(), uri.c_str(), ugi.c_str()));
+  return 0;
+}
+
+// open file in 'w' or 'r'
+std::shared_ptr<FsReadChannel> AfsClient::open_r(const FsChannelConfig& config,
+                                                 uint32_t buffer_size,
+                                                 int* err_no) {
+  std::shared_ptr<FsReadChannel> channel =
+      std::make_shared<FsReadChannel>(buffer_size);
+  std::shared_ptr<FILE> fp =
+      paddle::framework::fs_open_read(config.path, err_no, config.deconverter);
+  channel->open(fp, config);
+  return channel;
+}
+std::shared_ptr<FsWriteChannel> AfsClient::open_w(const FsChannelConfig& config,
+                                                  uint32_t buffer_size,
+                                                  int* err_no) {
+  std::shared_ptr<FsWriteChannel> channel =
+      std::make_shared<FsWriteChannel>(buffer_size);
+  std::shared_ptr<FILE> fp =
+      paddle::framework::fs_open_write(config.path, err_no, config.converter);
+  channel->open(fp, config);
+  return channel;
+}
+
+// remove file in path, path maybe a reg, such as 'part-000-*'
+void AfsClient::remove(const std::string& path) {
+  return paddle::framework::fs_remove(path);
+}
+void AfsClient::remove_dir(const std::string& dir) {
+  return paddle::framework::fs_remove(dir);
+}
+
+// list files in path, path maybe a dir with reg
+std::vector<std::string> AfsClient::list(const std::string& path) {
+  return paddle::framework::fs_list(path);
+}
+
+// exist or not
+bool AfsClient::exist(const std::string& dir) {
+  return paddle::framework::fs_exists(dir);
+}
+}  // namespace distributed
+}  // namespace paddle
diff --git a/paddle/fluid/distributed/common/afs_warpper.h b/paddle/fluid/distributed/common/afs_warpper.h
new file mode 100644
index 0000000000000..d10668046c0a7
--- /dev/null
+++ b/paddle/fluid/distributed/common/afs_warpper.h
@@ -0,0 +1,156 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <functional>
+#include <iostream>
+#include <memory>
+#include <string>
+#include <vector>
+#include "paddle/fluid/distributed/ps.pb.h"
+#include "paddle/fluid/string/string_helper.h"
+
+namespace paddle {
+namespace distributed {
+struct FsDataConverter {
+  std::string converter;
+  std::string deconverter;
+};
+
+struct FsChannelConfig {
+  std::string path;       // path of file
+  std::string converter;  // data converter
+  std::string deconverter;
+};
+
+class FsReadChannel {
+ public:
+  FsReadChannel() : _buffer_size(0) {}
+  explicit FsReadChannel(uint32_t buffer_size) : _buffer_size(buffer_size) {}
+  virtual ~FsReadChannel() {}
+  FsReadChannel(FsReadChannel&&) = delete;
+  FsReadChannel(const FsReadChannel&) = delete;
+  int open(std::shared_ptr<FILE> fp, const FsChannelConfig& config) {
+    _file = fp;
+    return 0;
+  }
+  inline int close() {
+    _file.reset();
+    return 0;
+  }
+
+  inline uint32_t read_line(std::string& line_data) {  // NOLINT
+    line_data.clear();
+    char buffer = '\0';
+    size_t read_count = 0;
+    while (1 == fread(&buffer, 1, 1, _file.get()) && buffer != '\n') {
+      ++read_count;
+      line_data.append(&buffer, 1);
+    }
+    if (read_count == 0 && buffer != '\n') {
+      return -1;
+    }
+    return 0;
+  }
+
+ private:
+  uint32_t _buffer_size;
+  FsChannelConfig _config;
+  std::shared_ptr<FILE> _file;
+};
+class FsWriteChannel {
+ public:
+  FsWriteChannel() : _buffer_size(0) {}
+  explicit FsWriteChannel(uint32_t buffer_size) : _buffer_size(buffer_size) {}
+  virtual ~FsWriteChannel() {}
+  FsWriteChannel(FsWriteChannel&&) = delete;
+  FsWriteChannel(const FsWriteChannel&) = delete;
+
+  int open(std::shared_ptr<FILE> fp, const FsChannelConfig& config) {
+    _file = fp;
+
+    // the buffer has set in fs.cc
+    // if (_buffer_size != 0) {
+    //    _buffer = std::shared_ptr<char>(new char[_buffer_size]);
+
+    //    CHECK(0 == setvbuf(&*_file, _buffer.get(), _IOFBF, _buffer_size));
+    //}
+    return 0;
+  }
+
+  inline void flush() { return; }
+
+  inline int close() {
+    flush();
+    _file.reset();
+    return 0;
+  }
+
+  inline uint32_t write_line(const char* data, uint32_t size) {
+    size_t write_count = fwrite_unlocked(data, 1, size, _file.get());
+    if (write_count != size) {
+      return -1;
+    }
+    write_count = fwrite_unlocked("\n", 1, 1, _file.get());
+    if (write_count != 1) {
+      return -1;
+    }
+    return 0;
+  }
+  inline uint32_t write_line(const std::string& data) {
+    return write_line(data.c_str(), data.size());
+  }
+
+ private:
+  uint32_t _buffer_size;
+  FsChannelConfig _config;
+  std::shared_ptr<FILE> _file;
+  std::shared_ptr<char> _buffer;
+};
+
+class AfsClient {
+ public:
+  AfsClient() {}
+  virtual ~AfsClient() {}
+  AfsClient(AfsClient&&) = delete;
+  AfsClient(const AfsClient&) = delete;
+
+  int initialize(const FsClientParameter& fs_client_param);
+  int initialize(const std::string& hadoop_bin, const std::string& uri,
+                 const std::string& user, const std::string& passwd,
+                 int buffer_size_param = (1L << 25));
+  int initialize(const std::string& hadoop_bin, const std::string& uri,
+                 const std::string& ugi, int buffer_size_param = (1L << 25));
+
+  // open file in 'w' or 'r'
+  std::shared_ptr<FsReadChannel> open_r(const FsChannelConfig& config,
+                                        uint32_t buffer_size = 0,
+                                        int* err_no = nullptr);
+  std::shared_ptr<FsWriteChannel> open_w(const FsChannelConfig& config,
+                                         uint32_t buffer_size = 0,
+                                         int* err_no = nullptr);
+
+  // remove file in path, path maybe a reg, such as 'part-000-*'
+  void remove(const std::string& path);
+  void remove_dir(const std::string& dir);
+
+  // list files in path, path maybe a dir with reg
+  std::vector<std::string> list(const std::string& path);
+
+  // exist or not
+  bool exist(const std::string& dir);
+};
+}  // namespace distributed
+}  // namespace paddle
diff --git a/paddle/fluid/distributed/common/cost_timer.h b/paddle/fluid/distributed/common/cost_timer.h
new file mode 100644
index 0000000000000..d7bf4cc11e0a3
--- /dev/null
+++ b/paddle/fluid/distributed/common/cost_timer.h
@@ -0,0 +1,93 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <memory>
+#include <unordered_map>
+#include "butil/time.h"
+#include "bvar/latency_recorder.h"
+#include "glog/logging.h"
+
+namespace paddle {
+namespace distributed {
+
+struct CostProfilerNode {
+  std::shared_ptr<bvar::LatencyRecorder> recorder;
+};
+
+class CostProfiler {
+ public:
+  ~CostProfiler() {}
+  static CostProfiler& instance() {
+    static CostProfiler profiler;
+    return profiler;
+  }
+
+  void register_profiler(const std::string& label) {
+    if (_cost_profiler_map.find(label) != _cost_profiler_map.end()) {
+      return;
+    }
+    auto profiler_node = std::make_shared<CostProfilerNode>();
+    profiler_node->recorder.reset(
+        new bvar::LatencyRecorder("cost_profiler", label));
+    _cost_profiler_map[label] = profiler_node;
+  }
+
+  CostProfilerNode* profiler(const std::string& label) {
+    auto itr = _cost_profiler_map.find(label);
+    if (itr != _cost_profiler_map.end()) {
+      return itr->second.get();
+    }
+    return NULL;
+  }
+
+ private:
+  CostProfiler() {}
+  std::unordered_map<std::string, std::shared_ptr<CostProfilerNode>>
+      _cost_profiler_map;
+};
+
+class CostTimer {
+ public:
+  explicit CostTimer(const std::string& label) {
+    _label = label;
+    auto& profiler = CostProfiler::instance();
+    _profiler_node = profiler.profiler(label);
+    // 如果不在profiler中，则使用log输出耗时信息
+    _is_print_cost = _profiler_node == NULL;
+    _start_time_ms = butil::gettimeofday_ms();
+  }
+  explicit CostTimer(CostProfilerNode& profiler_node) {  // NOLINT
+    _is_print_cost = false;
+    _profiler_node = &profiler_node;
+    _start_time_ms = butil::gettimeofday_ms();
+  }
+  ~CostTimer() {
+    if (_is_print_cost) {
+      LOG(INFO) << "CostTimer label:" << _label
+                << ", cost:" << butil::gettimeofday_ms() - _start_time_ms
+                << "ms";
+    } else {
+      *(_profiler_node->recorder) << butil::gettimeofday_ms() - _start_time_ms;
+    }
+  }
+
+ private:
+  std::string _label;
+  bool _is_print_cost;
+  uint64_t _start_time_ms;
+  CostProfilerNode* _profiler_node;
+};
+}  // namespace distributed
+}  // namespace paddle
diff --git a/paddle/fluid/distributed/common/utils.h b/paddle/fluid/distributed/common/utils.h
index 2305001ad6f8f..fb2189b8f5a1b 100644
--- a/paddle/fluid/distributed/common/utils.h
+++ b/paddle/fluid/distributed/common/utils.h
@@ -52,6 +52,20 @@ inline void ADD(int n, const T* x, const T y, T* z) {
   }
 }
 
+template <typename T>
+inline void DIV(int n, const T x, const T* y, T* z) {
+  for (int i = 0; i < n; ++i) {
+    z[i] = x / y[i];
+  }
+}
+
+template <typename T>
+inline void ELE_MUL(int n, const T* x, const T* y, T* z) {
+  for (int i = 0; i < n; ++i) {
+    z[i] = x[i] * y[i];
+  }
+}
+
 static bool StartWith(const std::string& str, const std::string& substr) {
   return str.find(substr) == 0;
 }
@@ -91,5 +105,6 @@ inline double GetCurrentUS() {
   gettimeofday(&time, NULL);
   return 1e+6 * time.tv_sec + time.tv_usec;
 }
+
 }  // namespace distributed
 }  // namespace paddle
diff --git a/paddle/fluid/distributed/fleet_executor/CMakeLists.txt b/paddle/fluid/distributed/fleet_executor/CMakeLists.txt
new file mode 100644
index 0000000000000..0941b2075b893
--- /dev/null
+++ b/paddle/fluid/distributed/fleet_executor/CMakeLists.txt
@@ -0,0 +1,24 @@
+proto_library(fleet_executor_desc_proto SRCS fleet_executor_desc.proto)
+if(WITH_PYTHON)
+  py_proto_compile(fleet_executor_desc_py_proto SRCS fleet_executor_desc.proto)
+endif()
+proto_library(interceptor_message_proto SRCS interceptor_message.proto)
+
+if(WITH_DISTRIBUTE AND NOT (WITH_ASCEND OR WITH_ASCEND_CL))
+  set(BRPC_DEPS brpc ssl crypto)
+else()
+  set(BRPC_DEPS "")
+endif()
+
+cc_library(fleet_executor SRCS fleet_executor.cc carrier.cc
+        interceptor.cc interceptor_message_service.cc message_bus.cc
+        DEPS proto_desc fleet_executor_desc_proto interceptor_message_proto ${BRPC_DEPS})
+
+if(WITH_DISTRIBUTE)
+  set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
+  set_source_files_properties(message_bus.h PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
+  set_source_files_properties(message_bus.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
+  set_source_files_properties(carrier.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
+  set_source_files_properties(interceptor_message_service.h PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
+  set_source_files_properties(interceptor_message_service.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
+endif()
diff --git a/paddle/fluid/distributed/fleet_executor/carrier.cc b/paddle/fluid/distributed/fleet_executor/carrier.cc
new file mode 100644
index 0000000000000..53a3af22c45e7
--- /dev/null
+++ b/paddle/fluid/distributed/fleet_executor/carrier.cc
@@ -0,0 +1,78 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/distributed/fleet_executor/carrier.h"
+#include "paddle/fluid/distributed/fleet_executor/interceptor.h"
+#include "paddle/fluid/distributed/fleet_executor/interceptor_message_service.h"
+#include "paddle/fluid/distributed/fleet_executor/task_node.h"
+
+namespace paddle {
+namespace distributed {
+
+Carrier::Carrier(
+    const std::unordered_map<int64_t, TaskNode*>& interceptor_id_to_node)
+    : interceptor_id_to_node_(interceptor_id_to_node) {
+  CreateInterceptors();
+}
+
+bool Carrier::EnqueueInterceptorMessage(
+    const InterceptorMessage& interceptor_message) {
+  // enqueue message to interceptor
+  if (interceptor_message.ctrl_message()) {
+    // handle control message
+    return true;
+  } else {
+    int64_t dst_id = interceptor_message.dst_id();
+    Interceptor* dst_interceptor = GetInterceptor(dst_id);
+    bool rst =
+        dst_interceptor->EnqueueRemoteInterceptorMessage(interceptor_message);
+    if (rst) {
+      std::condition_variable& interceptor_cond_var =
+          dst_interceptor->GetCondVar();
+      interceptor_cond_var.notify_all();
+    }
+    return rst;
+  }
+}
+
+Interceptor* Carrier::GetInterceptor(int64_t interceptor_id) {
+  auto iter = interceptor_idx_to_interceptor_.find(interceptor_id);
+  PADDLE_ENFORCE_NE(iter, interceptor_idx_to_interceptor_.end(),
+                    platform::errors::InvalidArgument(
+                        "Cannot find interceptor instance for interceptor "
+                        "id %lld. Wrong dst? Call before init?",
+                        interceptor_id));
+  return iter->second.get();
+}
+
+void Carrier::CreateInterceptors() {
+  // create each Interceptor
+  for (const auto& item : interceptor_id_to_node_) {
+    int64_t interceptor_id = item.first;
+    TaskNode* task_node = item.second;
+    const auto& iter = interceptor_idx_to_interceptor_.find(interceptor_id);
+    PADDLE_ENFORCE_EQ(iter, interceptor_idx_to_interceptor_.end(),
+                      platform::errors::AlreadyExists(
+                          "The interceptor id %lld has already been created! "
+                          "The interceptor is should be unique.",
+                          interceptor_id));
+    interceptor_idx_to_interceptor_.insert(std::make_pair(
+        interceptor_id,
+        std::make_unique<Interceptor>(interceptor_id, task_node)));
+    VLOG(3) << "Create Interceptor for " << interceptor_id;
+  }
+}
+
+}  // namespace distributed
+}  // namespace paddle
diff --git a/paddle/fluid/distributed/fleet_executor/carrier.h b/paddle/fluid/distributed/fleet_executor/carrier.h
new file mode 100644
index 0000000000000..bac836deaaaf7
--- /dev/null
+++ b/paddle/fluid/distributed/fleet_executor/carrier.h
@@ -0,0 +1,63 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <memory>
+#include <string>
+#include <unordered_map>
+
+#include "paddle/fluid/distributed/fleet_executor/interceptor_message.pb.h"
+#include "paddle/fluid/platform/enforce.h"
+#include "paddle/fluid/platform/errors.h"
+#include "paddle/fluid/platform/macros.h"
+
+namespace paddle {
+namespace distributed {
+
+class Interceptor;
+class TaskNode;
+class InterceptorMessageServiceImpl;
+
+class Carrier final {
+ public:
+  Carrier() = delete;
+
+  explicit Carrier(
+      const std::unordered_map<int64_t, TaskNode*>& interceptor_id_to_node);
+
+  ~Carrier() = default;
+
+  // Enqueue a message to corresponding interceptor id
+  bool EnqueueInterceptorMessage(const InterceptorMessage& interceptor_message);
+
+  DISABLE_COPY_AND_ASSIGN(Carrier);
+
+ private:
+  // create each Interceptor
+  void CreateInterceptors();
+
+  // get interceptor based on the interceptor id
+  Interceptor* GetInterceptor(int64_t interceptor_id);
+
+  // interceptor logic id to the Nodes info
+  std::unordered_map<int64_t, TaskNode*> interceptor_id_to_node_;
+
+  // interceptor logic id to actually interceptor
+  std::unordered_map<int64_t, std::unique_ptr<Interceptor>>
+      interceptor_idx_to_interceptor_;
+};
+
+}  // namespace distributed
+}  // namespace paddle
diff --git a/paddle/fluid/distributed/fleet_executor/fleet_executor.cc b/paddle/fluid/distributed/fleet_executor/fleet_executor.cc
new file mode 100644
index 0000000000000..b184ea8a71601
--- /dev/null
+++ b/paddle/fluid/distributed/fleet_executor/fleet_executor.cc
@@ -0,0 +1,53 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/distributed/fleet_executor/fleet_executor.h"
+#include "paddle/fluid/distributed/fleet_executor/runtime_graph.h"
+#include "paddle/fluid/framework/program_desc.h"
+
+namespace paddle {
+namespace distributed {
+
+FleetExecutor::FleetExecutor(const std::string& exe_desc_str) {
+  // Initialize Executor
+}
+
+FleetExecutor::~FleetExecutor() {
+  // Destroy Executor
+}
+
+void FleetExecutor::Init(const paddle::framework::ProgramDesc& program_desc) {
+  // Compile and Initialize
+}
+
+void FleetExecutor::Run() {
+  // Run
+}
+
+void FleetExecutor::Release() {
+  // Release
+}
+
+std::shared_ptr<Carrier> FleetExecutor::GetCarrier() {
+  // get carrier
+  return nullptr;
+}
+
+std::shared_ptr<MessageBus> FleetExecutor::GetMessageBus() {
+  // get message bus
+  return nullptr;
+}
+
+}  // namespace distributed
+}  // namespace paddle
diff --git a/paddle/fluid/distributed/fleet_executor/fleet_executor.h b/paddle/fluid/distributed/fleet_executor/fleet_executor.h
new file mode 100644
index 0000000000000..613dacf5496f7
--- /dev/null
+++ b/paddle/fluid/distributed/fleet_executor/fleet_executor.h
@@ -0,0 +1,51 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <memory>
+
+#include "paddle/fluid/distributed/fleet_executor/fleet_executor_desc.pb.h"
+#include "paddle/fluid/platform/macros.h"
+
+namespace paddle {
+namespace framework {
+class ProgramDesc;
+}
+
+namespace distributed {
+class RuntimeGraph;
+class Carrier;
+class MessageBus;
+
+class FleetExecutor final {
+ public:
+  FleetExecutor() = delete;
+  FleetExecutor(const std::string& exe_desc_str);
+  ~FleetExecutor();
+  void Init(const paddle::framework::ProgramDesc& program_desc);
+  void Run();
+  void Release();
+  static std::shared_ptr<Carrier> GetCarrier();
+  static std::shared_ptr<MessageBus> GetMessageBus();
+
+ private:
+  DISABLE_COPY_AND_ASSIGN(FleetExecutor);
+  FleetExecutorDesc exe_desc_;
+  std::unique_ptr<RuntimeGraph> runtime_graph_;
+  static std::shared_ptr<Carrier> global_carrier_;
+  static std::shared_ptr<MessageBus> global_message_bus_;
+};
+
+}  // namespace distributed
+}  // namespace paddle
diff --git a/paddle/fluid/distributed/fleet_executor/fleet_executor_desc.proto b/paddle/fluid/distributed/fleet_executor/fleet_executor_desc.proto
new file mode 100644
index 0000000000000..3db8984b5dcff
--- /dev/null
+++ b/paddle/fluid/distributed/fleet_executor/fleet_executor_desc.proto
@@ -0,0 +1,21 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+syntax = "proto2";
+package paddle.distributed;
+
+message FleetExecutorDesc {
+  optional string grain = 1 [ default = "coarse" ];
+  repeated string addrs = 2; // "ip:port" of all ranks
+}
diff --git a/paddle/fluid/distributed/fleet_executor/interceptor.cc b/paddle/fluid/distributed/fleet_executor/interceptor.cc
new file mode 100644
index 0000000000000..0b3f3ff2de84a
--- /dev/null
+++ b/paddle/fluid/distributed/fleet_executor/interceptor.cc
@@ -0,0 +1,109 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/distributed/fleet_executor/interceptor.h"
+
+namespace paddle {
+namespace distributed {
+
+Interceptor::Interceptor(int64_t interceptor_id, TaskNode* node)
+    : interceptor_id_(interceptor_id), node_(node) {
+  interceptor_thread_ = std::thread([this]() {
+    VLOG(3) << "Start pooling local mailbox's thread.";
+    PoolTheMailbox();
+  });
+}
+
+Interceptor::~Interceptor() { interceptor_thread_.join(); }
+
+void Interceptor::RegisterInterceptorHandle(InterceptorHandle handle) {
+  handle_ = handle;
+}
+
+void Interceptor::Handle(const InterceptorMessage& msg) {
+  if (handle_) {
+    handle_(msg);
+  }
+}
+
+std::condition_variable& Interceptor::GetCondVar() {
+  // get the conditional var
+  return cond_var_;
+}
+
+int64_t Interceptor::GetInterceptorId() const {
+  // return the interceptor id
+  return interceptor_id_;
+}
+
+bool Interceptor::EnqueueRemoteInterceptorMessage(
+    const InterceptorMessage& interceptor_message) {
+  // Called by Carrier, enqueue an InterceptorMessage to remote mailbox
+  VLOG(3) << "Enqueue message: " << interceptor_message.message_type()
+          << " into " << interceptor_id_ << "'s remote mailbox.";
+  std::unique_lock<std::mutex> lock(remote_mailbox_mutex_);
+  remote_mailbox_.push(interceptor_message);
+  return true;
+}
+
+void Interceptor::Send(int64_t dst_id,
+                       std::unique_ptr<InterceptorMessage> msg) {
+  msg->set_src_id(interceptor_id_);
+  msg->set_dst_id(dst_id);
+  // send interceptor msg
+}
+
+void Interceptor::PoolTheMailbox() {
+  // pool the local mailbox, parse the Message
+  while (true) {
+    if (local_mailbox_.empty()) {
+      // local mailbox is empty, fetch the remote mailbox
+      VLOG(3) << interceptor_id_ << "'s local mailbox is empty. "
+              << "Fetch the remote mailbox.";
+      PADDLE_ENFORCE_EQ(FetchRemoteMailbox(), true,
+                        platform::errors::InvalidArgument(
+                            "Error encountered when fetch remote mailbox."));
+    }
+    const InterceptorMessage interceptor_message = local_mailbox_.front();
+    local_mailbox_.pop();
+    const MessageType message_type = interceptor_message.message_type();
+    VLOG(3) << interceptor_id_ << " has received a message: " << message_type
+            << ".";
+    if (message_type == STOP) {
+      // break the pooling thread
+      break;
+    }
+
+    Handle(interceptor_message);
+  }
+}
+
+bool Interceptor::FetchRemoteMailbox() {
+  // fetch all Message from remote mailbox to local mailbox
+  // return true if remote mailbox not empty, otherwise return false
+  std::unique_lock<std::mutex> lock(remote_mailbox_mutex_);
+  cond_var_.wait(lock, [this]() { return !remote_mailbox_.empty(); });
+  if (remote_mailbox_.empty()) {
+    // the thread has been unblocked accidentally
+    return false;
+  }
+  while (!remote_mailbox_.empty()) {
+    local_mailbox_.push(std::move(remote_mailbox_.front()));
+    remote_mailbox_.pop();
+  }
+  return true;
+}
+
+}  // namespace distributed
+}  // namespace paddle
diff --git a/paddle/fluid/distributed/fleet_executor/interceptor.h b/paddle/fluid/distributed/fleet_executor/interceptor.h
new file mode 100644
index 0000000000000..02696d8edd737
--- /dev/null
+++ b/paddle/fluid/distributed/fleet_executor/interceptor.h
@@ -0,0 +1,102 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <condition_variable>
+#include <functional>
+#include <map>
+#include <memory>
+#include <queue>
+#include <thread>
+#include <vector>
+
+#include "paddle/fluid/distributed/fleet_executor/interceptor_message.pb.h"
+#include "paddle/fluid/platform/enforce.h"
+#include "paddle/fluid/platform/errors.h"
+#include "paddle/fluid/platform/macros.h"
+
+namespace paddle {
+namespace distributed {
+
+class TaskNode;
+
+class Interceptor {
+ public:
+  using InterceptorHandle = std::function<void(const InterceptorMessage&)>;
+
+ public:
+  Interceptor() = delete;
+
+  Interceptor(int64_t interceptor_id, TaskNode* node);
+
+  virtual ~Interceptor();
+
+  // register interceptor handle
+  void RegisterInterceptorHandle(InterceptorHandle handle);
+
+  void Handle(const InterceptorMessage& msg);
+
+  // return the interceptor id
+  int64_t GetInterceptorId() const;
+
+  // return the conditional var
+  std::condition_variable& GetCondVar();
+
+  // Called by Carrier, enqueue an InterceptorMessage to remote mailbox
+  bool EnqueueRemoteInterceptorMessage(
+      const InterceptorMessage& interceptor_message);
+
+  void Send(int64_t dst_id, std::unique_ptr<InterceptorMessage> msg);
+
+  DISABLE_COPY_AND_ASSIGN(Interceptor);
+
+ private:
+  // pool the local mailbox, parse the Message
+  void PoolTheMailbox();
+
+  // fetch all Message from remote mailbox to local mailbox
+  // return true if remote mailbox not empty, otherwise return false
+  bool FetchRemoteMailbox();
+
+  // interceptor id, handed from above layer
+  int64_t interceptor_id_;
+
+  // node need to be handled by this interceptor
+  TaskNode* node_;
+
+  // interceptor handle which process message
+  InterceptorHandle handle_{nullptr};
+
+  // mutex to control read/write conflict for remote mailbox
+  std::mutex remote_mailbox_mutex_;
+
+  // interceptor runs PoolTheMailbox() function to poll local mailbox
+  std::thread interceptor_thread_;
+
+  // conditional variable for blocking the thread when
+  // fetch an empty remote mailbox
+  std::condition_variable cond_var_;
+
+  // remote mailbox, written by EnqueueRemoteMessage()
+  // read by FetchRemoteMailbox()
+  std::queue<InterceptorMessage> remote_mailbox_;
+
+  // local mailbox, written by FetchRemoteMailbox()
+  // read by PoolTheMailbox()
+  std::queue<InterceptorMessage> local_mailbox_;
+};
+
+}  // namespace distributed
+}  // namespace paddle
diff --git a/paddle/fluid/distributed/fleet_executor/interceptor_message.proto b/paddle/fluid/distributed/fleet_executor/interceptor_message.proto
new file mode 100644
index 0000000000000..a2fe01cfe3822
--- /dev/null
+++ b/paddle/fluid/distributed/fleet_executor/interceptor_message.proto
@@ -0,0 +1,40 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+syntax = "proto2";
+package paddle.distributed;
+option cc_generic_services = true;
+option cc_enable_arenas = true;
+
+enum MessageType {
+  STOP = 1;            // STOP an Interceptor
+  DATA_IS_READY = 2;   // upstream data is ready
+  DATE_IS_USELESS = 3; // downstream has used the data
+  ERROR = 4;           // current Interceptor encounters error
+  RESET = 5;           // reset the status
+}
+
+message InterceptorMessage {
+  optional int64 src_id = 1 [ default = 0 ];
+  optional int64 dst_id = 2 [ default = 0 ];
+  optional MessageType message_type = 3 [ default = RESET ];
+  optional bool ctrl_message = 4 [ default = false ];
+}
+
+message InterceptorResponse { optional bool rst = 1 [ default = false ]; }
+
+service TheInterceptorMessageService {
+  rpc InterceptorMessageService(InterceptorMessage)
+      returns (InterceptorResponse);
+}
diff --git a/paddle/fluid/distributed/fleet_executor/interceptor_message_service.cc b/paddle/fluid/distributed/fleet_executor/interceptor_message_service.cc
new file mode 100644
index 0000000000000..d30d356e4ff28
--- /dev/null
+++ b/paddle/fluid/distributed/fleet_executor/interceptor_message_service.cc
@@ -0,0 +1,42 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \
+    !defined(PADDLE_WITH_ASCEND_CL)
+#include "paddle/fluid/distributed/fleet_executor/interceptor_message_service.h"
+#include "brpc/server.h"
+#include "paddle/fluid/distributed/fleet_executor/carrier.h"
+#include "paddle/fluid/distributed/fleet_executor/fleet_executor.h"
+
+namespace paddle {
+namespace distributed {
+
+void InterceptorMessageServiceImpl::InterceptorMessageService(
+    google::protobuf::RpcController* control_base,
+    const InterceptorMessage* request, InterceptorResponse* response,
+    google::protobuf::Closure* done) {
+  brpc::ClosureGuard done_guard(done);
+  VLOG(3) << "Interceptor Message Service receives a message from: "
+          << request->src_id()
+          << ", with the message: " << request->message_type();
+  response->set_rst(true);
+  // call interceptor manager's method to handle the message
+  std::shared_ptr<Carrier> carrier = FleetExecutor::GetCarrier();
+  if (carrier != nullptr) {
+    carrier->EnqueueInterceptorMessage(*request);
+  }
+}
+
+}  // namespace distributed
+}  // namespace paddle
+#endif
diff --git a/paddle/fluid/distributed/fleet_executor/interceptor_message_service.h b/paddle/fluid/distributed/fleet_executor/interceptor_message_service.h
new file mode 100644
index 0000000000000..0a8dfc861a910
--- /dev/null
+++ b/paddle/fluid/distributed/fleet_executor/interceptor_message_service.h
@@ -0,0 +1,36 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \
+    !defined(PADDLE_WITH_ASCEND_CL)
+#pragma once
+
+#include "brpc/server.h"
+#include "paddle/fluid/distributed/fleet_executor/interceptor_message.pb.h"
+
+namespace paddle {
+namespace distributed {
+
+class InterceptorMessageServiceImpl : public TheInterceptorMessageService {
+ public:
+  InterceptorMessageServiceImpl() {}
+  virtual ~InterceptorMessageServiceImpl() {}
+  virtual void InterceptorMessageService(
+      google::protobuf::RpcController* control_base,
+      const InterceptorMessage* request, InterceptorResponse* response,
+      google::protobuf::Closure* done);
+};
+
+}  // namespace distributed
+}  // namespace paddle
+#endif
diff --git a/paddle/fluid/distributed/fleet_executor/message_bus.cc b/paddle/fluid/distributed/fleet_executor/message_bus.cc
new file mode 100644
index 0000000000000..0094dbd1f10a1
--- /dev/null
+++ b/paddle/fluid/distributed/fleet_executor/message_bus.cc
@@ -0,0 +1,184 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <memory>
+
+#include "paddle/fluid/distributed/fleet_executor/carrier.h"
+#include "paddle/fluid/distributed/fleet_executor/fleet_executor.h"
+#include "paddle/fluid/distributed/fleet_executor/message_bus.h"
+
+namespace paddle {
+namespace distributed {
+
+MessageBus::MessageBus(
+    const std::unordered_map<int64_t, int64_t>& interceptor_id_to_rank,
+    const std::unordered_map<int64_t, std::string>& rank_to_addr,
+    const std::string& addr)
+    : interceptor_id_to_rank_(interceptor_id_to_rank),
+      rank_to_addr_(rank_to_addr),
+      addr_(addr) {
+  listen_port_thread_ = std::thread([this]() {
+    VLOG(3) << "Start listen_port_thread_ for message bus";
+    ListenPort();
+  });
+}
+
+MessageBus::~MessageBus() {
+#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \
+    !defined(PADDLE_WITH_ASCEND_CL)
+  server_.Stop(1000);
+  server_.Join();
+#endif
+  listen_port_thread_.join();
+}
+
+bool MessageBus::Send(const InterceptorMessage& interceptor_message) {
+  // called by Interceptor, send InterceptorMessage to dst
+  int64_t src_id = interceptor_message.src_id();
+  int64_t dst_id = interceptor_message.dst_id();
+  if (IsSameRank(src_id, dst_id)) {
+    VLOG(3) << "Send a message from rank " << src_id << " to rank " << dst_id
+            << ", which are same ranks.";
+    return SendIntraRank(interceptor_message);
+  } else {
+    VLOG(3) << "Send a message from rank " << src_id << " to rank " << dst_id
+            << ", which are different ranks.";
+#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \
+    !defined(PADDLE_WITH_ASCEND_CL)
+    int retry_time = 0;  // message bus will retry sending for 10 times
+    while (retry_time < 10) {
+      ++retry_time;
+      if (SendInterRank(interceptor_message)) {
+        VLOG(3) << "Message bus sends inter rank successfully with "
+                << retry_time << " times retries.";
+        return true;
+      }
+    }
+    VLOG(3) << "Message bus sends inter rank fail after 10 times retries.";
+    return false;
+#else
+    PADDLE_THROW(platform::errors::Unavailable(
+        "Fleet executor does not support sending message between different "
+        "ranks when Paddle is compiled with npu or "
+        "isn't compiled with distributed for now."));
+#endif
+  }
+  return true;
+}
+
+void MessageBus::ListenPort() {
+#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \
+    !defined(PADDLE_WITH_ASCEND_CL)
+  // function keep listen the port and handle the message
+  InterceptorMessageServiceImpl interceptor_message_service;
+  PADDLE_ENFORCE_EQ(server_.AddService(&interceptor_message_service,
+                                       brpc::SERVER_DOESNT_OWN_SERVICE),
+                    0, platform::errors::Unavailable(
+                           "Message bus: init brpc service error."));
+
+  // start the server
+  const char* ip_for_brpc = addr_.c_str();
+  brpc::ServerOptions options;
+  options.idle_timeout_sec = -1;
+  PADDLE_ENFORCE_EQ(
+      server_.Start(ip_for_brpc, &options), 0,
+      platform::errors::Unavailable("Message bus: start brpc service error."));
+  VLOG(3) << "Message bus's listen port thread starts successful.";
+#else
+  VLOG(3) << "Fleet executor's ListenPort() is a fake function when Paddle is "
+             "compiled with npu or Paddle isn't compiled "
+             "with distributed for now.";
+#endif
+}
+
+bool MessageBus::IsSameRank(int64_t src_id, int64_t dst_id) {
+  // check whether the dst is the same rank or different rank with src
+  const auto& src_rank = interceptor_id_to_rank_.find(src_id);
+  const auto& dst_rank = interceptor_id_to_rank_.find(dst_id);
+  PADDLE_ENFORCE_NE(
+      src_rank, interceptor_id_to_rank_.end(),
+      platform::errors::NotFound(
+          "Cannot find rank for src interceptor id %lld. Init error.", src_id));
+  PADDLE_ENFORCE_NE(
+      dst_rank, interceptor_id_to_rank_.end(),
+      platform::errors::NotFound(
+          "Cannot find rank for dst interceptor id %lld. Init error.", dst_id));
+  const auto& src_ip = rank_to_addr_.find(src_rank->second);
+  PADDLE_ENFORCE_NE(src_ip, rank_to_addr_.end(),
+                    platform::errors::NotFound(
+                        "Cannot find addr for src rank id %lld. Init error.",
+                        src_rank->second));
+  PADDLE_ENFORCE_EQ(
+      src_ip->second, addr_,
+      platform::errors::Fatal("The src interceptor's addr is %s, while the "
+                              "message bus's addr is %s, which are different. "
+                              "Init error.",
+                              src_ip->second, addr_));
+  return src_rank->second == dst_rank->second;
+}
+
+#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \
+    !defined(PADDLE_WITH_ASCEND_CL)
+bool MessageBus::SendInterRank(const InterceptorMessage& interceptor_message) {
+  // send the message inter rank (dst is different rank with src)
+  int64_t dst_id = interceptor_message.dst_id();
+  int64_t dst_rank = interceptor_id_to_rank_[dst_id];
+  auto dst_ip = rank_to_addr_.find(dst_rank);
+  PADDLE_ENFORCE_NE(dst_ip, rank_to_addr_.end(),
+                    platform::errors::InvalidArgument(
+                        "Cannot find rank for dst interceptor id %lld. "
+                        "Init error.",
+                        dst_id));
+  const char* dst_ip_for_brpc = dst_ip->second.c_str();
+  brpc::Channel channel;
+  brpc::ChannelOptions options;
+  options.protocol = "baidu_std";
+  options.connect_timeout_ms = 1000;
+  options.timeout_ms = 1000;
+  options.max_retry = 5;
+  PADDLE_ENFORCE_EQ(
+      channel.Init(dst_ip_for_brpc, &options), 0,
+      platform::errors::Unavailable("Message bus: init brpc channel error."));
+  TheInterceptorMessageService_Stub stub(&channel);
+  InterceptorResponse response;
+  brpc::Controller ctrl;
+  ctrl.set_log_id(0);
+  stub.InterceptorMessageService(&ctrl, &interceptor_message, &response, NULL);
+  if (!ctrl.Failed()) {
+    if (response.rst()) {
+      VLOG(3) << "Message bus: brpc sends success.";
+      return true;
+    } else {
+      VLOG(4) << "Message bus: InterceptorMessageService error.";
+      return false;
+    }
+  } else {
+    VLOG(4) << "Message bus: brpc sends failed with error text: "
+            << ctrl.ErrorText();
+    return false;
+  }
+}
+#endif
+
+bool MessageBus::SendIntraRank(const InterceptorMessage& interceptor_message) {
+  // send the message intra rank (dst is the same rank with src)
+  std::shared_ptr<Carrier> carrier = FleetExecutor::GetCarrier();
+  if (carrier != nullptr) {
+    return carrier->EnqueueInterceptorMessage(interceptor_message);
+  }
+  return true;
+}
+
+}  // namespace distributed
+}  // namespace paddle
diff --git a/paddle/fluid/distributed/fleet_executor/message_bus.h b/paddle/fluid/distributed/fleet_executor/message_bus.h
new file mode 100644
index 0000000000000..86f34e203c5de
--- /dev/null
+++ b/paddle/fluid/distributed/fleet_executor/message_bus.h
@@ -0,0 +1,90 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <string>
+#include <thread>
+#include <unordered_map>
+
+#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \
+    !defined(PADDLE_WITH_ASCEND_CL)
+#include "brpc/channel.h"
+#include "brpc/server.h"
+#include "paddle/fluid/distributed/fleet_executor/interceptor_message_service.h"
+#endif
+
+#include "paddle/fluid/distributed/fleet_executor/interceptor_message.pb.h"
+#include "paddle/fluid/platform/enforce.h"
+#include "paddle/fluid/platform/errors.h"
+#include "paddle/fluid/platform/macros.h"
+
+namespace paddle {
+namespace distributed {
+
+class Carrier;
+
+class MessageBus final {
+ public:
+  MessageBus() = delete;
+
+  MessageBus(const std::unordered_map<int64_t, int64_t>& interceptor_id_to_rank,
+             const std::unordered_map<int64_t, std::string>& rank_to_addr,
+             const std::string& addr);
+
+  ~MessageBus();
+
+  // called by Interceptor, send InterceptorMessage to dst
+  bool Send(const InterceptorMessage& interceptor_message);
+
+  DISABLE_COPY_AND_ASSIGN(MessageBus);
+
+ private:
+  // function keep listen the port and handle the message
+  void ListenPort();
+
+  // check whether the dst is the same rank or different rank with src
+  bool IsSameRank(int64_t src_id, int64_t dst_id);
+
+#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \
+    !defined(PADDLE_WITH_ASCEND_CL)
+  // send the message inter rank (dst is different rank with src)
+  bool SendInterRank(const InterceptorMessage& interceptor_message);
+#endif
+
+  // send the message intra rank (dst is the same rank with src)
+  bool SendIntraRank(const InterceptorMessage& interceptor_message);
+
+  // handed by above layer, save the info mapping interceptor id to rank id
+  std::unordered_map<int64_t, int64_t> interceptor_id_to_rank_;
+
+  // handed by above layer, save the info mapping rank id to addr
+  std::unordered_map<int64_t, std::string> rank_to_addr_;
+
+  // the ip needs to be listened
+  std::string addr_;
+
+#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \
+    !defined(PADDLE_WITH_ASCEND_CL)
+  // brpc server
+  brpc::Server server_;
+#endif
+
+  // thread keeps listening to the port to receive remote message
+  // this thread runs ListenPort() function
+  std::thread listen_port_thread_;
+};
+
+}  // namespace distributed
+}  // namespace paddle
diff --git a/paddle/fluid/distributed/fleet_executor/runtime_graph.h b/paddle/fluid/distributed/fleet_executor/runtime_graph.h
new file mode 100644
index 0000000000000..7ae573039e671
--- /dev/null
+++ b/paddle/fluid/distributed/fleet_executor/runtime_graph.h
@@ -0,0 +1,34 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+namespace paddle {
+namespace framework {
+class ProgramDesc;
+}
+
+namespace distributed {
+
+class RuntimeGraph final {
+ public:
+  RuntimeGraph() = default;
+  explicit RuntimeGraph(const paddle::framework::ProgramDesc &program) {}
+  ~RuntimeGraph() = default;
+
+  DISABLE_COPY_AND_ASSIGN(RuntimeGraph);
+};
+
+}  // namespace distributed
+}  // namespace paddle
diff --git a/paddle/fluid/distributed/fleet_executor/task_node.h b/paddle/fluid/distributed/fleet_executor/task_node.h
new file mode 100644
index 0000000000000..62fb9dfb01188
--- /dev/null
+++ b/paddle/fluid/distributed/fleet_executor/task_node.h
@@ -0,0 +1,27 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+namespace paddle {
+namespace distributed {
+
+class TaskNode final {
+ public:
+  TaskNode() = default;
+  ~TaskNode() = default;
+};
+
+}  // namespace distributed
+}  // namespace paddle
diff --git a/paddle/fluid/distributed/service/communicator.cc b/paddle/fluid/distributed/service/communicator.cc
index 3d5ab8e16d902..30529d73fa199 100644
--- a/paddle/fluid/distributed/service/communicator.cc
+++ b/paddle/fluid/distributed/service/communicator.cc
@@ -283,6 +283,18 @@ void Communicator::RpcSendSparse(const std::string &var_name, int table_id,
     push_g_vec.push_back(tensor->mutable_value()->data<float>() + i * dim);
   }
 
+  // TODO(wangguanqun): padding_idx is not ignored, this is a bug.
+  // if padding_idx == padding in datareader, the server will core.
+  /*
+  for (size_t i = 0; i < tensor->rows().size(); ++i) {
+    uint64_t real_id = static_cast<uint64_t>(tensor->rows()[i]);
+    if (real_id != 0) {
+      sparse_push_keys.push_back(real_id);
+      push_g_vec.push_back(tensor->mutable_value()->data<float>() + i * dim);
+    }
+  }
+  */
+
   ++_async_call_num;
   DownpourBrpcClosure *closure = new DownpourBrpcClosure(
       request_call_num, [this, request_call_num](void *done) {
@@ -353,6 +365,17 @@ void Communicator::InitParams(const RecvCtxMap &recv_varname_to_ctx) {
   return;
 }
 
+void Communicator::PullDense(const RecvCtxMap &recv_varname_to_ctx) {
+  for (auto &iter : recv_varname_to_ctx) {
+    auto &table_id = iter.first;
+    auto &varnames = iter.second;
+    RpcRecvDense(varnames, table_id, recv_scope_);
+    VLOG(1) << "pull dense param to table " << table_id
+            << " from 0' trainer done";
+  }
+  return;
+}
+
 void Communicator::RpcProfilerControl() {
   if (trainer_id_ == 0) {
     if (!do_server_profiler_ && platform::IsProfileEnabled()) {
diff --git a/paddle/fluid/distributed/service/communicator.h b/paddle/fluid/distributed/service/communicator.h
index c6d37defbd626..01ec3c617d551 100644
--- a/paddle/fluid/distributed/service/communicator.h
+++ b/paddle/fluid/distributed/service/communicator.h
@@ -271,6 +271,8 @@ class Communicator {
 
   virtual void InitParams(const RecvCtxMap &recv_varname_to_ctx);
 
+  virtual void PullDense(const RecvCtxMap &recv_varname_to_ctx);
+
   virtual void Start() = 0;
 
   virtual void Stop() = 0;
diff --git a/paddle/fluid/distributed/service/env.h b/paddle/fluid/distributed/service/env.h
index ca395a776afd4..0cc57229b7a82 100644
--- a/paddle/fluid/distributed/service/env.h
+++ b/paddle/fluid/distributed/service/env.h
@@ -144,8 +144,8 @@ class PSEnvironment {
 
   virtual std::vector<uint64_t> get_client_info() {
     std::vector<uint64_t> client_info;
-    for (auto &i : _ps_client_sign_set) {
-      client_info.push_back(i);
+    for (auto &i : _ps_client_list) {
+      client_info.push_back(i.serialize_to_uint64());
     }
     return client_info;
   }
@@ -250,7 +250,7 @@ class PaddlePSEnvironment : public PSEnvironment {
     return 0;
   }
 
-  virtual int32_t set_ps_clients(std::vector<std::string> *host_sign_list,
+  virtual int32_t set_ps_clients(const std::vector<std::string> *host_sign_list,
                                  int node_num) {
     _ps_client_list.clear();
     _ps_client_sign_set.clear();
@@ -265,6 +265,7 @@ class PaddlePSEnvironment : public PSEnvironment {
     std::sort(
         _ps_client_list.begin(), _ps_client_list.end(),
         [](const PSHost &h1, const PSHost &h2) { return h1.rank < h2.rank; });
+    VLOG(1) << "env.set_ps_clients done\n";
     return 0;
   }
 
diff --git a/paddle/fluid/distributed/service/graph_brpc_client.cc b/paddle/fluid/distributed/service/graph_brpc_client.cc
index 9f65a66708def..13132740bb1dc 100644
--- a/paddle/fluid/distributed/service/graph_brpc_client.cc
+++ b/paddle/fluid/distributed/service/graph_brpc_client.cc
@@ -302,7 +302,7 @@ std::future<int32_t> GraphBrpcClient::remove_graph_node(
   return fut;
 }
 // char* &buffer,int &actual_size
-std::future<int32_t> GraphBrpcClient::batch_sample_neighboors(
+std::future<int32_t> GraphBrpcClient::batch_sample_neighbors(
     uint32_t table_id, std::vector<uint64_t> node_ids, int sample_size,
     std::vector<std::vector<std::pair<uint64_t, float>>> &res,
     int server_index) {
@@ -390,8 +390,8 @@ std::future<int32_t> GraphBrpcClient::batch_sample_neighboors(
         size_t fail_num = 0;
         for (size_t request_idx = 0; request_idx < request_call_num;
              ++request_idx) {
-          if (closure->check_response(request_idx,
-                                      PS_GRAPH_SAMPLE_NEIGHBOORS) != 0) {
+          if (closure->check_response(request_idx, PS_GRAPH_SAMPLE_NEIGHBORS) !=
+              0) {
             ++fail_num;
           } else {
             auto &res_io_buffer =
@@ -435,7 +435,7 @@ std::future<int32_t> GraphBrpcClient::batch_sample_neighboors(
 
   for (int request_idx = 0; request_idx < request_call_num; ++request_idx) {
     int server_index = request2server[request_idx];
-    closure->request(request_idx)->set_cmd_id(PS_GRAPH_SAMPLE_NEIGHBOORS);
+    closure->request(request_idx)->set_cmd_id(PS_GRAPH_SAMPLE_NEIGHBORS);
     closure->request(request_idx)->set_table_id(table_id);
     closure->request(request_idx)->set_client_id(_client_id);
     size_t node_num = node_id_buckets[request_idx].size();
@@ -494,6 +494,47 @@ std::future<int32_t> GraphBrpcClient::random_sample_nodes(
                    closure);
   return fut;
 }
+
+std::future<int32_t> GraphBrpcClient::use_neighbors_sample_cache(
+    uint32_t table_id, size_t total_size_limit, size_t ttl) {
+  DownpourBrpcClosure *closure = new DownpourBrpcClosure(
+      server_size, [&, server_size = this->server_size ](void *done) {
+        int ret = 0;
+        auto *closure = (DownpourBrpcClosure *)done;
+        size_t fail_num = 0;
+        for (size_t request_idx = 0; request_idx < server_size; ++request_idx) {
+          if (closure->check_response(
+                  request_idx, PS_GRAPH_USE_NEIGHBORS_SAMPLE_CACHE) != 0) {
+            ++fail_num;
+            break;
+          }
+        }
+        ret = fail_num == 0 ? 0 : -1;
+        closure->set_promise_value(ret);
+      });
+  auto promise = std::make_shared<std::promise<int32_t>>();
+  closure->add_promise(promise);
+  size_t size_limit = total_size_limit / server_size +
+                      (total_size_limit % server_size != 0 ? 1 : 0);
+  std::future<int> fut = promise->get_future();
+  for (size_t i = 0; i < server_size; i++) {
+    int server_index = i;
+    closure->request(server_index)
+        ->set_cmd_id(PS_GRAPH_USE_NEIGHBORS_SAMPLE_CACHE);
+    closure->request(server_index)->set_table_id(table_id);
+    closure->request(server_index)->set_client_id(_client_id);
+    closure->request(server_index)
+        ->add_params((char *)&size_limit, sizeof(size_t));
+    closure->request(server_index)->add_params((char *)&ttl, sizeof(size_t));
+    GraphPsService_Stub rpc_stub =
+        getServiceStub(get_cmd_channel(server_index));
+    closure->cntl(server_index)->set_log_id(butil::gettimeofday_ms());
+    rpc_stub.service(closure->cntl(server_index),
+                     closure->request(server_index),
+                     closure->response(server_index), closure);
+  }
+  return fut;
+}
 std::future<int32_t> GraphBrpcClient::pull_graph_list(
     uint32_t table_id, int server_index, int start, int size, int step,
     std::vector<FeatureNode> &res) {
@@ -515,7 +556,7 @@ std::future<int32_t> GraphBrpcClient::pull_graph_list(
         index += node.get_size(false);
         res.push_back(node);
       }
-      delete buffer;
+      delete[] buffer;
     }
     closure->set_promise_value(ret);
   });
diff --git a/paddle/fluid/distributed/service/graph_brpc_client.h b/paddle/fluid/distributed/service/graph_brpc_client.h
index 1fbb3fa9b0550..c1083afb71abf 100644
--- a/paddle/fluid/distributed/service/graph_brpc_client.h
+++ b/paddle/fluid/distributed/service/graph_brpc_client.h
@@ -61,8 +61,8 @@ class GraphBrpcClient : public BrpcPsClient {
  public:
   GraphBrpcClient() {}
   virtual ~GraphBrpcClient() {}
-  // given a batch of nodes, sample graph_neighboors for each of them
-  virtual std::future<int32_t> batch_sample_neighboors(
+  // given a batch of nodes, sample graph_neighbors for each of them
+  virtual std::future<int32_t> batch_sample_neighbors(
       uint32_t table_id, std::vector<uint64_t> node_ids, int sample_size,
       std::vector<std::vector<std::pair<uint64_t, float>>>& res,
       int server_index = -1);
@@ -89,6 +89,9 @@ class GraphBrpcClient : public BrpcPsClient {
   virtual std::future<int32_t> add_graph_node(
       uint32_t table_id, std::vector<uint64_t>& node_id_list,
       std::vector<bool>& is_weighted_list);
+  virtual std::future<int32_t> use_neighbors_sample_cache(uint32_t table_id,
+                                                          size_t size_limit,
+                                                          size_t ttl);
   virtual std::future<int32_t> remove_graph_node(
       uint32_t table_id, std::vector<uint64_t>& node_id_list);
   virtual int32_t initialize();
diff --git a/paddle/fluid/distributed/service/graph_brpc_server.cc b/paddle/fluid/distributed/service/graph_brpc_server.cc
index b404082f7c410..0aba2b9f44ae7 100644
--- a/paddle/fluid/distributed/service/graph_brpc_server.cc
+++ b/paddle/fluid/distributed/service/graph_brpc_server.cc
@@ -187,8 +187,8 @@ int32_t GraphBrpcService::initialize() {
   _service_handler_map[PS_STOP_PROFILER] = &GraphBrpcService::stop_profiler;
 
   _service_handler_map[PS_PULL_GRAPH_LIST] = &GraphBrpcService::pull_graph_list;
-  _service_handler_map[PS_GRAPH_SAMPLE_NEIGHBOORS] =
-      &GraphBrpcService::graph_random_sample_neighboors;
+  _service_handler_map[PS_GRAPH_SAMPLE_NEIGHBORS] =
+      &GraphBrpcService::graph_random_sample_neighbors;
   _service_handler_map[PS_GRAPH_SAMPLE_NODES] =
       &GraphBrpcService::graph_random_sample_nodes;
   _service_handler_map[PS_GRAPH_GET_NODE_FEAT] =
@@ -201,8 +201,9 @@ int32_t GraphBrpcService::initialize() {
   _service_handler_map[PS_GRAPH_SET_NODE_FEAT] =
       &GraphBrpcService::graph_set_node_feat;
   _service_handler_map[PS_GRAPH_SAMPLE_NODES_FROM_ONE_SERVER] =
-      &GraphBrpcService::sample_neighboors_across_multi_servers;
-
+      &GraphBrpcService::sample_neighbors_across_multi_servers;
+  _service_handler_map[PS_GRAPH_USE_NEIGHBORS_SAMPLE_CACHE] =
+      &GraphBrpcService::use_neighbors_sample_cache;
   // shard初始化,server启动后才可从env获取到server_list的shard信息
   initialize_shard_info();
 
@@ -373,7 +374,7 @@ int32_t GraphBrpcService::pull_graph_list(Table *table,
   cntl->response_attachment().append(buffer.get(), actual_size);
   return 0;
 }
-int32_t GraphBrpcService::graph_random_sample_neighboors(
+int32_t GraphBrpcService::graph_random_sample_neighbors(
     Table *table, const PsRequestMessage &request, PsResponseMessage &response,
     brpc::Controller *cntl) {
   CHECK_TABLE_EXIST(table, request, response)
@@ -386,10 +387,10 @@ int32_t GraphBrpcService::graph_random_sample_neighboors(
   size_t node_num = request.params(0).size() / sizeof(uint64_t);
   uint64_t *node_data = (uint64_t *)(request.params(0).c_str());
   int sample_size = *(uint64_t *)(request.params(1).c_str());
-  std::vector<std::unique_ptr<char[]>> buffers(node_num);
+  std::vector<std::shared_ptr<char>> buffers(node_num);
   std::vector<int> actual_sizes(node_num, 0);
   ((GraphTable *)table)
-      ->random_sample_neighboors(node_data, sample_size, buffers, actual_sizes);
+      ->random_sample_neighbors(node_data, sample_size, buffers, actual_sizes);
 
   cntl->response_attachment().append(&node_num, sizeof(size_t));
   cntl->response_attachment().append(actual_sizes.data(),
@@ -448,7 +449,7 @@ int32_t GraphBrpcService::graph_get_node_feat(Table *table,
 
   return 0;
 }
-int32_t GraphBrpcService::sample_neighboors_across_multi_servers(
+int32_t GraphBrpcService::sample_neighbors_across_multi_servers(
     Table *table, const PsRequestMessage &request, PsResponseMessage &response,
     brpc::Controller *cntl) {
   // sleep(5);
@@ -456,7 +457,7 @@ int32_t GraphBrpcService::sample_neighboors_across_multi_servers(
   if (request.params_size() < 2) {
     set_response_code(
         response, -1,
-        "graph_random_sample request requires at least 2 arguments");
+        "graph_random_neighbors_sample request requires at least 2 arguments");
     return 0;
   }
   size_t node_num = request.params(0).size() / sizeof(uint64_t),
@@ -487,7 +488,7 @@ int32_t GraphBrpcService::sample_neighboors_across_multi_servers(
         request2server.size() - 1;
   }
   size_t request_call_num = request2server.size();
-  std::vector<std::unique_ptr<char[]>> local_buffers;
+  std::vector<std::shared_ptr<char>> local_buffers;
   std::vector<int> local_actual_sizes;
   std::vector<size_t> seq;
   std::vector<std::vector<uint64_t>> node_id_buckets(request_call_num);
@@ -519,7 +520,7 @@ int32_t GraphBrpcService::sample_neighboors_across_multi_servers(
         remote_call_num);
     size_t fail_num = 0;
     for (size_t request_idx = 0; request_idx < remote_call_num; ++request_idx) {
-      if (closure->check_response(request_idx, PS_GRAPH_SAMPLE_NEIGHBOORS) !=
+      if (closure->check_response(request_idx, PS_GRAPH_SAMPLE_NEIGHBORS) !=
           0) {
         ++fail_num;
         failed[request2server[request_idx]] = true;
@@ -570,7 +571,7 @@ int32_t GraphBrpcService::sample_neighboors_across_multi_servers(
 
   for (int request_idx = 0; request_idx < remote_call_num; ++request_idx) {
     int server_index = request2server[request_idx];
-    closure->request(request_idx)->set_cmd_id(PS_GRAPH_SAMPLE_NEIGHBOORS);
+    closure->request(request_idx)->set_cmd_id(PS_GRAPH_SAMPLE_NEIGHBORS);
     closure->request(request_idx)->set_table_id(request.table_id());
     closure->request(request_idx)->set_client_id(rank);
     size_t node_num = node_id_buckets[request_idx].size();
@@ -590,8 +591,8 @@ int32_t GraphBrpcService::sample_neighboors_across_multi_servers(
   }
   if (server2request[rank] != -1) {
     ((GraphTable *)table)
-        ->random_sample_neighboors(node_id_buckets.back().data(), sample_size,
-                                   local_buffers, local_actual_sizes);
+        ->random_sample_neighbors(node_id_buckets.back().data(), sample_size,
+                                  local_buffers, local_actual_sizes);
   }
   local_promise.get()->set_value(0);
   if (remote_call_num == 0) func(closure);
@@ -636,5 +637,20 @@ int32_t GraphBrpcService::graph_set_node_feat(Table *table,
   return 0;
 }
 
+int32_t GraphBrpcService::use_neighbors_sample_cache(
+    Table *table, const PsRequestMessage &request, PsResponseMessage &response,
+    brpc::Controller *cntl) {
+  CHECK_TABLE_EXIST(table, request, response)
+  if (request.params_size() < 2) {
+    set_response_code(response, -1,
+                      "use_neighbors_sample_cache request requires at least 2 "
+                      "arguments[cache_size, ttl]");
+    return 0;
+  }
+  size_t size_limit = *(size_t *)(request.params(0).c_str());
+  size_t ttl = *(size_t *)(request.params(1).c_str());
+  ((GraphTable *)table)->make_neighbor_sample_cache(size_limit, ttl);
+  return 0;
+}
 }  // namespace distributed
 }  // namespace paddle
diff --git a/paddle/fluid/distributed/service/graph_brpc_server.h b/paddle/fluid/distributed/service/graph_brpc_server.h
index 817fe08331165..d1a6aa63604f3 100644
--- a/paddle/fluid/distributed/service/graph_brpc_server.h
+++ b/paddle/fluid/distributed/service/graph_brpc_server.h
@@ -78,10 +78,10 @@ class GraphBrpcService : public PsBaseService {
   int32_t initialize_shard_info();
   int32_t pull_graph_list(Table *table, const PsRequestMessage &request,
                           PsResponseMessage &response, brpc::Controller *cntl);
-  int32_t graph_random_sample_neighboors(Table *table,
-                                         const PsRequestMessage &request,
-                                         PsResponseMessage &response,
-                                         brpc::Controller *cntl);
+  int32_t graph_random_sample_neighbors(Table *table,
+                                        const PsRequestMessage &request,
+                                        PsResponseMessage &response,
+                                        brpc::Controller *cntl);
   int32_t graph_random_sample_nodes(Table *table,
                                     const PsRequestMessage &request,
                                     PsResponseMessage &response,
@@ -116,9 +116,15 @@ class GraphBrpcService : public PsBaseService {
   int32_t print_table_stat(Table *table, const PsRequestMessage &request,
                            PsResponseMessage &response, brpc::Controller *cntl);
 
-  int32_t sample_neighboors_across_multi_servers(
-      Table *table, const PsRequestMessage &request,
-      PsResponseMessage &response, brpc::Controller *cntl);
+  int32_t sample_neighbors_across_multi_servers(Table *table,
+                                                const PsRequestMessage &request,
+                                                PsResponseMessage &response,
+                                                brpc::Controller *cntl);
+
+  int32_t use_neighbors_sample_cache(Table *table,
+                                     const PsRequestMessage &request,
+                                     PsResponseMessage &response,
+                                     brpc::Controller *cntl);
 
  private:
   bool _is_initialize_shard_info;
diff --git a/paddle/fluid/distributed/service/graph_py_service.cc b/paddle/fluid/distributed/service/graph_py_service.cc
index 498805136417f..78f239f80d445 100644
--- a/paddle/fluid/distributed/service/graph_py_service.cc
+++ b/paddle/fluid/distributed/service/graph_py_service.cc
@@ -290,19 +290,29 @@ void GraphPyClient::load_node_file(std::string name, std::string filepath) {
   }
 }
 std::vector<std::vector<std::pair<uint64_t, float>>>
-GraphPyClient::batch_sample_neighboors(std::string name,
-                                       std::vector<uint64_t> node_ids,
-                                       int sample_size) {
+GraphPyClient::batch_sample_neighbors(std::string name,
+                                      std::vector<uint64_t> node_ids,
+                                      int sample_size) {
   std::vector<std::vector<std::pair<uint64_t, float>>> v;
   if (this->table_id_map.count(name)) {
     uint32_t table_id = this->table_id_map[name];
     auto status =
-        worker_ptr->batch_sample_neighboors(table_id, node_ids, sample_size, v);
+        worker_ptr->batch_sample_neighbors(table_id, node_ids, sample_size, v);
     status.wait();
   }
   return v;
 }
 
+void GraphPyClient::use_neighbors_sample_cache(std::string name,
+                                               size_t total_size_limit,
+                                               size_t ttl) {
+  if (this->table_id_map.count(name)) {
+    uint32_t table_id = this->table_id_map[name];
+    auto status =
+        worker_ptr->use_neighbors_sample_cache(table_id, total_size_limit, ttl);
+    status.wait();
+  }
+}
 std::vector<uint64_t> GraphPyClient::random_sample_nodes(std::string name,
                                                          int server_index,
                                                          int sample_size) {
diff --git a/paddle/fluid/distributed/service/graph_py_service.h b/paddle/fluid/distributed/service/graph_py_service.h
index 8e03938801ce9..2d36edbf9c17d 100644
--- a/paddle/fluid/distributed/service/graph_py_service.h
+++ b/paddle/fluid/distributed/service/graph_py_service.h
@@ -148,13 +148,15 @@ class GraphPyClient : public GraphPyService {
   int get_client_id() { return client_id; }
   void set_client_id(int client_id) { this->client_id = client_id; }
   void start_client();
-  std::vector<std::vector<std::pair<uint64_t, float>>> batch_sample_neighboors(
+  std::vector<std::vector<std::pair<uint64_t, float>>> batch_sample_neighbors(
       std::string name, std::vector<uint64_t> node_ids, int sample_size);
   std::vector<uint64_t> random_sample_nodes(std::string name, int server_index,
                                             int sample_size);
   std::vector<std::vector<std::string>> get_node_feat(
       std::string node_type, std::vector<uint64_t> node_ids,
       std::vector<std::string> feature_names);
+  void use_neighbors_sample_cache(std::string name, size_t total_size_limit,
+                                  size_t ttl);
   void set_node_feat(std::string node_type, std::vector<uint64_t> node_ids,
                      std::vector<std::string> feature_names,
                      const std::vector<std::vector<std::string>> features);
diff --git a/paddle/fluid/distributed/service/ps_client.h b/paddle/fluid/distributed/service/ps_client.h
index 74a1e0dde71fc..3be83436cec34 100644
--- a/paddle/fluid/distributed/service/ps_client.h
+++ b/paddle/fluid/distributed/service/ps_client.h
@@ -20,11 +20,13 @@
 #include <string>
 #include <unordered_map>
 #include <vector>
+#include "paddle/fluid/distributed/common/cost_timer.h"
 #include "paddle/fluid/distributed/ps.pb.h"
 #include "paddle/fluid/distributed/service/env.h"
 #include "paddle/fluid/distributed/service/sendrecv.pb.h"
 #include "paddle/fluid/distributed/table/accessor.h"
 #include "paddle/fluid/distributed/table/graph/graph_node.h"
+#include "paddle/fluid/platform/timer.h"
 
 namespace paddle {
 namespace distributed {
@@ -35,7 +37,7 @@ using paddle::distributed::PsResponseMessage;
 typedef std::function<void(void *)> PSClientCallBack;
 class PSClientClosure : public google::protobuf::Closure {
  public:
-  PSClientClosure(PSClientCallBack callback) : _callback(callback) {}
+  explicit PSClientClosure(PSClientCallBack callback) : _callback(callback) {}
   virtual ~PSClientClosure() {}
   virtual void set_promise_value(int value) {
     for (auto &promise : _promises) {
@@ -43,12 +45,17 @@ class PSClientClosure : public google::protobuf::Closure {
     }
   }
 
-  void add_promise(std::shared_ptr<std::promise<int32_t>> &promise) {
+  void add_promise(std::shared_ptr<std::promise<int32_t>> &promise) {  // NOLINT
     _promises.push_back(promise);
   }
 
+  void add_timer(std::shared_ptr<CostTimer> &timer) {  // NOLINT
+    _timers.push_back(timer);
+  }
+
  protected:
   PSClientCallBack _callback;
+  std::vector<std::shared_ptr<CostTimer>> _timers;
   std::vector<std::shared_ptr<std::promise<int32_t>>> _promises;
 };
 
@@ -59,11 +66,11 @@ class PSClient {
   PSClient(PSClient &&) = delete;
   PSClient(const PSClient &) = delete;
 
-  virtual int32_t configure(
+  virtual int32_t configure(  // NOLINT
       const PSParameter &config,
       const std::map<uint64_t, std::vector<paddle::distributed::Region>>
           &regions,
-      PSEnvironment &_env, size_t client_id) final;
+      PSEnvironment &_env, size_t client_id) final;  // NOLINT
 
   virtual int32_t create_client2client_connection(
       int pserver_timeout_ms, int pserver_connect_timeout_ms,
@@ -86,7 +93,7 @@ class PSClient {
   virtual std::future<int32_t> save(uint32_t table_id, const std::string &epoch,
                                     const std::string &mode) = 0;
 
-  //清空table数据
+  // 清空table数据
   virtual std::future<int32_t> clear() = 0;
   virtual std::future<int32_t> clear(uint32_t table_id) = 0;
 
@@ -98,7 +105,7 @@ class PSClient {
   // server将参数区块中配置的某一维提取返回
   // 返回数据解包后填充到累计的多个buffer中
   virtual std::future<int32_t> pull_dense(Region *regions, size_t region_num,
-                                          size_t table_id) = 0;  //保留
+                                          size_t table_id) = 0;  // 保留
 
   // firstly push dense param for parameter server
   // this is neccessary because dense weight initialized in trainer on cold
@@ -107,6 +114,9 @@ class PSClient {
                                                 size_t region_num,
                                                 size_t table_id) = 0;
 
+  //  virtual std::future<int32_t> push_dense(const Region *regions,
+  //                                          size_t region_num,
+  //                                          size_t table_id) = 0;
   // 使用keys进行pull请求，结果填充values
   // keys和values的个数均为num个，每个value占用select_size空间
   // future结束前keys和values缓冲区不能再次使用
@@ -212,6 +222,10 @@ class PSClient {
                                                  const uint64_t *keys,
                                                  const float **update_values,
                                                  size_t num, void *done) = 0;
+  //  virtual std::future<int32_t> push_sparse(size_t table_id,
+  //                                           const uint64_t *keys,
+  //                                           const float **update_values,
+  //                                           size_t num) = 0;
 
  protected:
   virtual int32_t initialize() = 0;
@@ -222,8 +236,42 @@ class PSClient {
   PSEnvironment *_env;
   std::unordered_map<uint32_t, std::shared_ptr<ValueAccessor>> _table_accessors;
   std::unordered_map<int32_t, MsgHandlerFunc>
-      _msg_handler_map;  //处理client2client消息
+      _msg_handler_map;  // 处理client2client消息
+};
+
+template <class T>
+class AsyncRequestTask {
+ public:
+  AsyncRequestTask() : _promise(std::make_shared<std::promise<int32_t>>()) {}
+  AsyncRequestTask(T &data, size_t table_id, std::shared_ptr<CostTimer> &timer)
+      : _table_id(table_id),
+        _timer(timer),
+        _promise(std::make_shared<std::promise<int32_t>>()) {
+    _data = std::move(data);
+  }
+
+  AsyncRequestTask(AsyncRequestTask &data)  // NOLINT
+      : _table_id(data.table_id()),
+        _timer(data.timer()),
+        _promise(data.promise()) {
+    _data = std::move(data.data());
+  }
+
+  ~AsyncRequestTask() {}
+
+  inline T &data() { return _data; }
+  inline size_t table_id() { return _table_id; }
+  inline std::shared_ptr<CostTimer> &timer() { return _timer; }
+  inline std::future<int32_t> get_future() { return _promise->get_future(); }
+  inline std::shared_ptr<std::promise<int32_t>> &promise() { return _promise; }
+
+ private:
+  T _data;
+  size_t _table_id;
+  std::shared_ptr<CostTimer> _timer;
+  std::shared_ptr<std::promise<int32_t>> _promise;
 };
+
 REGISTER_PSCORE_REGISTERER(PSClient);
 
 class PSClientFactory {
diff --git a/paddle/fluid/distributed/service/sendrecv.proto b/paddle/fluid/distributed/service/sendrecv.proto
index 42e25258ec3fe..8ee9b3590721a 100644
--- a/paddle/fluid/distributed/service/sendrecv.proto
+++ b/paddle/fluid/distributed/service/sendrecv.proto
@@ -49,7 +49,7 @@ enum PsCmdID {
   PS_STOP_PROFILER = 28;
   PS_PUSH_GLOBAL_STEP = 29;
   PS_PULL_GRAPH_LIST = 30;
-  PS_GRAPH_SAMPLE_NEIGHBOORS = 31;
+  PS_GRAPH_SAMPLE_NEIGHBORS = 31;
   PS_GRAPH_SAMPLE_NODES = 32;
   PS_GRAPH_GET_NODE_FEAT = 33;
   PS_GRAPH_CLEAR = 34;
@@ -57,6 +57,7 @@ enum PsCmdID {
   PS_GRAPH_REMOVE_GRAPH_NODE = 36;
   PS_GRAPH_SET_NODE_FEAT = 37;
   PS_GRAPH_SAMPLE_NODES_FROM_ONE_SERVER = 38;
+  PS_GRAPH_USE_NEIGHBORS_SAMPLE_CACHE = 39;
 }
 
 message PsRequestMessage {
diff --git a/paddle/fluid/distributed/table/CMakeLists.txt b/paddle/fluid/distributed/table/CMakeLists.txt
index 7ec7041b63ba1..0201b627801cb 100644
--- a/paddle/fluid/distributed/table/CMakeLists.txt
+++ b/paddle/fluid/distributed/table/CMakeLists.txt
@@ -37,7 +37,9 @@ set_source_files_properties(table.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPI
 
 set_source_files_properties(sparse_sgd_rule.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
 set_source_files_properties(ctr_accessor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
+set_source_files_properties(memory_sparse_table.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
 cc_library(sparse_sgd_rule SRCS sparse_sgd_rule.cc DEPS ${TABLE_DEPS} ps_framework_proto)
 cc_library(ctr_accessor SRCS ctr_accessor.cc DEPS ${TABLE_DEPS} ps_framework_proto sparse_sgd_rule)
+cc_library(memory_sparse_table SRCS memory_sparse_table.cc DEPS ps_framework_proto ${TABLE_DEPS} fs afs_wrapper ctr_accessor common_table)
 
-cc_library(table SRCS table.cc DEPS common_table tensor_accessor tensor_table ps_framework_proto string_helper device_context gflags glog boost ctr_accessor)
+cc_library(table SRCS table.cc DEPS memory_sparse_table common_table tensor_accessor tensor_table ps_framework_proto string_helper device_context gflags glog boost)
diff --git a/paddle/fluid/distributed/table/accessor.h b/paddle/fluid/distributed/table/accessor.h
index 7cc92ce98ba69..8929e8cd64e84 100644
--- a/paddle/fluid/distributed/table/accessor.h
+++ b/paddle/fluid/distributed/table/accessor.h
@@ -17,15 +17,12 @@
 #include <stdio.h>
 #include <unordered_map>
 #include <vector>
+#include "paddle/fluid/distributed/common/afs_warpper.h"
 #include "paddle/fluid/distributed/common/registerer.h"
 #include "paddle/fluid/distributed/ps.pb.h"
 
 namespace paddle {
 namespace distributed {
-struct FsDataConverter {
-  std::string converter;
-  std::string deconverter;
-};
 
 struct Region {
   Region() : data(NULL), size(0) {}
@@ -50,8 +47,8 @@ struct DataConverter {
 
 class ValueAccessor {
  public:
-  explicit ValueAccessor(){};
-  virtual ~ValueAccessor(){};
+  ValueAccessor() {}
+  virtual ~ValueAccessor() {}
 
   virtual int configure(const TableAccessorParameter& parameter) {
     _config = parameter;
diff --git a/paddle/fluid/distributed/table/common_graph_table.cc b/paddle/fluid/distributed/table/common_graph_table.cc
index 2c20e79b3b2d3..96ebf039aae77 100644
--- a/paddle/fluid/distributed/table/common_graph_table.cc
+++ b/paddle/fluid/distributed/table/common_graph_table.cc
@@ -392,15 +392,89 @@ int32_t GraphTable::random_sample_nodes(int sample_size,
   memcpy(pointer, res.data(), actual_size);
   return 0;
 }
-int32_t GraphTable::random_sample_neighboors(
+int32_t GraphTable::random_sample_neighbors(
     uint64_t *node_ids, int sample_size,
-    std::vector<std::unique_ptr<char[]>> &buffers,
+    std::vector<std::shared_ptr<char>> &buffers,
     std::vector<int> &actual_sizes) {
   size_t node_num = buffers.size();
+  std::function<void(char *)> char_del = [](char *c) { delete[] c; };
   std::vector<std::future<int>> tasks;
+  if (use_cache) {
+    std::vector<std::vector<uint32_t>> seq_id(shard_end - shard_start);
+    std::vector<std::vector<SampleKey>> id_list(shard_end - shard_start);
+    size_t index;
+    for (size_t idx = 0; idx < node_num; ++idx) {
+      index = get_thread_pool_index(node_ids[idx]);
+      seq_id[index].emplace_back(idx);
+      id_list[index].emplace_back(node_ids[idx], sample_size);
+    }
+    for (int i = 0; i < seq_id.size(); i++) {
+      if (seq_id[i].size() == 0) continue;
+      tasks.push_back(_shards_task_pool[i]->enqueue([&, i, this]() -> int {
+        uint64_t node_id;
+        std::vector<std::pair<SampleKey, SampleResult>> r;
+        auto response =
+            scaled_lru->query(i, id_list[i].data(), id_list[i].size(), r);
+        int index = 0;
+        uint32_t idx;
+        std::vector<SampleResult> sample_res;
+        std::vector<SampleKey> sample_keys;
+        auto &rng = _shards_task_rng_pool[i];
+        for (size_t k = 0; k < id_list[i].size(); k++) {
+          if (index < r.size() &&
+              r[index].first.node_key == id_list[i][k].node_key) {
+            idx = seq_id[i][k];
+            actual_sizes[idx] = r[index].second.actual_size;
+            buffers[idx] = r[index].second.buffer;
+            index++;
+          } else {
+            node_id = id_list[i][k].node_key;
+            Node *node = find_node(node_id);
+            idx = seq_id[i][k];
+            int &actual_size = actual_sizes[idx];
+            if (node == nullptr) {
+              actual_size = 0;
+              continue;
+            }
+            std::shared_ptr<char> &buffer = buffers[idx];
+            std::vector<int> res = node->sample_k(sample_size, rng);
+            actual_size = res.size() * (Node::id_size + Node::weight_size);
+            int offset = 0;
+            uint64_t id;
+            float weight;
+            char *buffer_addr = new char[actual_size];
+            if (response == LRUResponse::ok) {
+              sample_keys.emplace_back(node_id, sample_size);
+              sample_res.emplace_back(actual_size, buffer_addr);
+              buffer = sample_res.back().buffer;
+            } else {
+              buffer.reset(buffer_addr, char_del);
+            }
+            for (int &x : res) {
+              id = node->get_neighbor_id(x);
+              weight = node->get_neighbor_weight(x);
+              memcpy(buffer_addr + offset, &id, Node::id_size);
+              offset += Node::id_size;
+              memcpy(buffer_addr + offset, &weight, Node::weight_size);
+              offset += Node::weight_size;
+            }
+          }
+        }
+        if (sample_res.size()) {
+          scaled_lru->insert(i, sample_keys.data(), sample_res.data(),
+                             sample_keys.size());
+        }
+        return 0;
+      }));
+    }
+    for (auto &t : tasks) {
+      t.get();
+    }
+    return 0;
+  }
   for (size_t idx = 0; idx < node_num; ++idx) {
     uint64_t &node_id = node_ids[idx];
-    std::unique_ptr<char[]> &buffer = buffers[idx];
+    std::shared_ptr<char> &buffer = buffers[idx];
     int &actual_size = actual_sizes[idx];
 
     int thread_pool_index = get_thread_pool_index(node_id);
@@ -419,7 +493,7 @@ int32_t GraphTable::random_sample_neighboors(
       uint64_t id;
       float weight;
       char *buffer_addr = new char[actual_size];
-      buffer.reset(buffer_addr);
+      buffer.reset(buffer_addr, char_del);
       for (int &x : res) {
         id = node->get_neighbor_id(x);
         weight = node->get_neighbor_weight(x);
diff --git a/paddle/fluid/distributed/table/common_graph_table.h b/paddle/fluid/distributed/table/common_graph_table.h
index d681262c66480..91f2b1c029d80 100644
--- a/paddle/fluid/distributed/table/common_graph_table.h
+++ b/paddle/fluid/distributed/table/common_graph_table.h
@@ -17,11 +17,23 @@
 #include <ThreadPool.h>
 #include <assert.h>
 #include <pthread.h>
+#include <algorithm>
+#include <cassert>
+#include <cstdio>
+#include <ctime>
+#include <functional>
+#include <iostream>
 #include <list>
+#include <map>
 #include <memory>
 #include <mutex>  // NOLINT
+#include <numeric>
+#include <queue>
+#include <set>
 #include <string>
+#include <thread>
 #include <unordered_map>
+#include <unordered_set>
 #include <utility>
 #include <vector>
 #include "paddle/fluid/distributed/table/accessor.h"
@@ -62,18 +74,302 @@ class GraphShard {
   int shard_num;
   std::vector<Node *> bucket;
 };
+
+enum LRUResponse { ok = 0, blocked = 1, err = 2 };
+
+struct SampleKey {
+  uint64_t node_key;
+  size_t sample_size;
+  SampleKey(uint64_t _node_key, size_t _sample_size)
+      : node_key(_node_key), sample_size(_sample_size) {
+    // std::cerr<<"in constructor of samplekey\n";
+  }
+  bool operator==(const SampleKey &s) const {
+    return node_key == s.node_key && sample_size == s.sample_size;
+  }
+};
+
+class SampleResult {
+ public:
+  size_t actual_size;
+  std::shared_ptr<char> buffer;
+  SampleResult(size_t _actual_size, std::shared_ptr<char> &_buffer)
+      : actual_size(_actual_size), buffer(_buffer) {}
+  SampleResult(size_t _actual_size, char *_buffer)
+      : actual_size(_actual_size),
+        buffer(_buffer, [](char *p) { delete[] p; }) {}
+  ~SampleResult() {}
+};
+
+template <typename K, typename V>
+class LRUNode {
+ public:
+  LRUNode(K _key, V _data, size_t _ttl) : key(_key), data(_data), ttl(_ttl) {
+    next = pre = NULL;
+  }
+  std::chrono::milliseconds ms;
+  // the last hit time
+  K key;
+  V data;
+  size_t ttl;
+  // time to live
+  LRUNode<K, V> *pre, *next;
+};
+template <typename K, typename V>
+class ScaledLRU;
+
+template <typename K, typename V>
+class RandomSampleLRU {
+ public:
+  RandomSampleLRU(ScaledLRU<K, V> *_father) : father(_father) {
+    node_size = 0;
+    node_head = node_end = NULL;
+    global_ttl = father->ttl;
+  }
+
+  ~RandomSampleLRU() {
+    LRUNode<K, V> *p;
+    while (node_head != NULL) {
+      p = node_head->next;
+      delete node_head;
+      node_head = p;
+    }
+  }
+  LRUResponse query(K *keys, size_t length, std::vector<std::pair<K, V>> &res) {
+    if (pthread_rwlock_tryrdlock(&father->rwlock) != 0)
+      return LRUResponse::blocked;
+    int init_node_size = node_size;
+    try {
+      for (size_t i = 0; i < length; i++) {
+        auto iter = key_map.find(keys[i]);
+        if (iter != key_map.end()) {
+          res.push_back({keys[i], iter->second->data});
+          iter->second->ttl--;
+          if (iter->second->ttl == 0) {
+            remove(iter->second, true);
+          } else {
+            remove(iter->second);
+            add_to_tail(iter->second);
+          }
+        }
+      }
+    } catch (...) {
+      pthread_rwlock_unlock(&father->rwlock);
+      father->handle_size_diff(node_size - init_node_size);
+      return LRUResponse::err;
+    }
+    pthread_rwlock_unlock(&father->rwlock);
+    father->handle_size_diff(node_size - init_node_size);
+    return LRUResponse::ok;
+  }
+  LRUResponse insert(K *keys, V *data, size_t length) {
+    if (pthread_rwlock_tryrdlock(&father->rwlock) != 0)
+      return LRUResponse::blocked;
+    int init_node_size = node_size;
+    try {
+      for (size_t i = 0; i < length; i++) {
+        auto iter = key_map.find(keys[i]);
+        if (iter != key_map.end()) {
+          iter->second->ttl = global_ttl;
+          remove(iter->second);
+          add_to_tail(iter->second);
+          iter->second->data = data[i];
+        } else {
+          LRUNode<K, V> *temp = new LRUNode<K, V>(keys[i], data[i], global_ttl);
+          add_to_tail(temp);
+          key_map[keys[i]] = temp;
+        }
+      }
+    } catch (...) {
+      pthread_rwlock_unlock(&father->rwlock);
+      father->handle_size_diff(node_size - init_node_size);
+      return LRUResponse::err;
+    }
+    pthread_rwlock_unlock(&father->rwlock);
+    father->handle_size_diff(node_size - init_node_size);
+    return LRUResponse::ok;
+  }
+  void remove(LRUNode<K, V> *node, bool del = false) {
+    if (node->pre) {
+      node->pre->next = node->next;
+    } else {
+      node_head = node->next;
+    }
+    if (node->next) {
+      node->next->pre = node->pre;
+    } else {
+      node_end = node->pre;
+    }
+    node_size--;
+    if (del) {
+      delete node;
+      key_map.erase(node->key);
+    }
+  }
+
+  void add_to_tail(LRUNode<K, V> *node) {
+    if (node_end == NULL) {
+      node_head = node_end = node;
+      node->next = node->pre = NULL;
+    } else {
+      node_end->next = node;
+      node->pre = node_end;
+      node->next = NULL;
+      node_end = node;
+    }
+    node_size++;
+    node->ms = std::chrono::duration_cast<std::chrono::milliseconds>(
+        std::chrono::system_clock::now().time_since_epoch());
+  }
+
+ private:
+  std::unordered_map<K, LRUNode<K, V> *> key_map;
+  ScaledLRU<K, V> *father;
+  size_t global_ttl;
+  int node_size;
+  LRUNode<K, V> *node_head, *node_end;
+  friend class ScaledLRU<K, V>;
+};
+
+template <typename K, typename V>
+class ScaledLRU {
+ public:
+  ScaledLRU(size_t shard_num, size_t size_limit, size_t _ttl)
+      : size_limit(size_limit), ttl(_ttl) {
+    pthread_rwlock_init(&rwlock, NULL);
+    stop = false;
+    thread_pool.reset(new ::ThreadPool(1));
+    global_count = 0;
+    lru_pool = std::vector<RandomSampleLRU<K, V>>(shard_num,
+                                                  RandomSampleLRU<K, V>(this));
+    shrink_job = std::thread([this]() -> void {
+      while (true) {
+        {
+          std::unique_lock<std::mutex> lock(mutex_);
+          cv_.wait_for(lock, std::chrono::milliseconds(3000));
+          if (stop) {
+            return;
+          }
+        }
+
+        // shrink();
+        // std::cerr<<"shrink job in queue\n";
+        auto status =
+            thread_pool->enqueue([this]() -> int { return shrink(); });
+        status.wait();
+      }
+    });
+    shrink_job.detach();
+  }
+  ~ScaledLRU() {
+    std::unique_lock<std::mutex> lock(mutex_);
+    // std::cerr<<"cancel shrink job\n";
+    stop = true;
+    cv_.notify_one();
+    // pthread_cancel(shrink_job.native_handle());
+  }
+  LRUResponse query(size_t index, K *keys, size_t length,
+                    std::vector<std::pair<K, V>> &res) {
+    return lru_pool[index].query(keys, length, res);
+  }
+  LRUResponse insert(size_t index, K *keys, V *data, size_t length) {
+    return lru_pool[index].insert(keys, data, length);
+  }
+  int shrink() {
+    int node_size = 0;
+    std::string t = "";
+    for (size_t i = 0; i < lru_pool.size(); i++) {
+      node_size += lru_pool[i].node_size;
+      // t += std::to_string(i) + "->" + std::to_string(lru_pool[i].node_size) +
+      // " ";
+    }
+    // std::cout<<t<<std::endl;
+
+    if (node_size <= size_limit) return 0;
+    if (pthread_rwlock_wrlock(&rwlock) == 0) {
+      try {
+        global_count = 0;
+        std::priority_queue<RemovedNode, std::vector<RemovedNode>,
+                            std::greater<RemovedNode>>
+            q;
+        for (size_t i = 0; i < lru_pool.size(); i++) {
+          if (lru_pool[i].node_size > 0) {
+            global_count += lru_pool[i].node_size;
+            q.push({lru_pool[i].node_head, &lru_pool[i]});
+          }
+        }
+        if (global_count > size_limit) {
+          // std::cout<<"before shrinking cache, cached nodes count =
+          // "<<global_count<<std::endl;
+          size_t remove = global_count - size_limit;
+          while (remove--) {
+            RemovedNode remove_node = q.top();
+            q.pop();
+            auto next = remove_node.node->next;
+            if (next) {
+              q.push({next, remove_node.lru_pointer});
+            }
+            global_count--;
+            remove_node.lru_pointer->key_map.erase(remove_node.node->key);
+            remove_node.lru_pointer->remove(remove_node.node, true);
+          }
+          // std::cout<<"after shrinking cache, cached nodes count =
+          // "<<global_count<<std::endl;
+        }
+      } catch (...) {
+        // std::cout << "shrink cache failed"<<std::endl;
+        pthread_rwlock_unlock(&rwlock);
+        return -1;
+      }
+      pthread_rwlock_unlock(&rwlock);
+      return 0;
+    }
+    return 0;
+  }
+  void handle_size_diff(int diff) {
+    if (diff != 0) {
+      __sync_fetch_and_add(&global_count, diff);
+      if (global_count > int(1.5 * size_limit)) {
+        // std::cout<<"global_count too large "<<global_count<<" enter start
+        // shrink task\n";
+        thread_pool->enqueue([this]() -> int { return shrink(); });
+      }
+    }
+  }
+
+  size_t get_ttl() { return ttl; }
+
+ private:
+  pthread_rwlock_t rwlock;
+  int global_count;
+  size_t size_limit;
+  size_t ttl;
+  bool stop;
+  std::thread shrink_job;
+  std::vector<RandomSampleLRU<K, V>> lru_pool;
+  mutable std::mutex mutex_;
+  std::condition_variable cv_;
+  struct RemovedNode {
+    LRUNode<K, V> *node;
+    RandomSampleLRU<K, V> *lru_pointer;
+    bool operator>(const RemovedNode &a) const { return node->ms > a.node->ms; }
+  };
+  std::shared_ptr<::ThreadPool> thread_pool;
+  friend class RandomSampleLRU<K, V>;
+};
+
 class GraphTable : public SparseTable {
  public:
-  GraphTable() {}
+  GraphTable() { use_cache = false; }
   virtual ~GraphTable() {}
   virtual int32_t pull_graph_list(int start, int size,
                                   std::unique_ptr<char[]> &buffer,
                                   int &actual_size, bool need_feature,
                                   int step);
 
-  virtual int32_t random_sample_neighboors(
+  virtual int32_t random_sample_neighbors(
       uint64_t *node_ids, int sample_size,
-      std::vector<std::unique_ptr<char[]>> &buffers,
+      std::vector<std::shared_ptr<char>> &buffers,
       std::vector<int> &actual_sizes);
 
   int32_t random_sample_nodes(int sample_size, std::unique_ptr<char[]> &buffers,
@@ -131,6 +427,18 @@ class GraphTable : public SparseTable {
 
   size_t get_server_num() { return server_num; }
 
+  virtual int32_t make_neighbor_sample_cache(size_t size_limit, size_t ttl) {
+    {
+      std::unique_lock<std::mutex> lock(mutex_);
+      if (use_cache == false) {
+        scaled_lru.reset(new ScaledLRU<SampleKey, SampleResult>(
+            shard_end - shard_start, size_limit, ttl));
+        use_cache = true;
+      }
+    }
+    return 0;
+  }
+
  protected:
   std::vector<GraphShard> shards;
   size_t shard_start, shard_end, server_num, shard_num_per_server, shard_num;
@@ -146,7 +454,20 @@ class GraphTable : public SparseTable {
 
   std::vector<std::shared_ptr<::ThreadPool>> _shards_task_pool;
   std::vector<std::shared_ptr<std::mt19937_64>> _shards_task_rng_pool;
+  std::shared_ptr<ScaledLRU<SampleKey, SampleResult>> scaled_lru;
+  bool use_cache;
+  mutable std::mutex mutex_;
 };
 }  // namespace distributed
 
 };  // namespace paddle
+
+namespace std {
+
+template <>
+struct hash<paddle::distributed::SampleKey> {
+  size_t operator()(const paddle::distributed::SampleKey &s) const {
+    return s.node_key ^ s.sample_size;
+  }
+};
+}
diff --git a/paddle/fluid/distributed/table/common_sparse_table.cc b/paddle/fluid/distributed/table/common_sparse_table.cc
index 8b79b1c02fce5..e124160e712e0 100644
--- a/paddle/fluid/distributed/table/common_sparse_table.cc
+++ b/paddle/fluid/distributed/table/common_sparse_table.cc
@@ -279,18 +279,25 @@ int32_t CommonSparseTable::set_global_lr(float* lr) {
   return 0;
 }
 
-int32_t CommonSparseTable::load(const std::string& path,
+int32_t CommonSparseTable::load(const std::string& dirname,
                                 const std::string& param) {
   auto begin = GetCurrentUS();
   rwlock_->WRLock();
-  LoadFromText(path, param, _shard_idx, _shard_num, task_pool_size_,
+  auto varname = _config.common().table_name();
+  std::string var_store =
+      string::Sprintf("%s/%s%s", dirname, varname, PSERVER_SAVE_SUFFIX);
+  std::string shard_var_pre =
+      string::Sprintf("%s.block%d", varname, _shard_idx);
+  std::string value_ = string::Sprintf("%s/%s.txt", var_store, shard_var_pre);
+  std::string meta_ = string::Sprintf("%s/%s.meta", var_store, shard_var_pre);
+
+  LoadFromText(value_, meta_, _shard_idx, _shard_num, task_pool_size_,
                &shard_values_);
   rwlock_->UNLock();
   auto end = GetCurrentUS();
 
-  auto varname = _config.common().table_name();
-  VLOG(0) << "load " << varname << " with value: " << path
-          << " , meta: " << param
+  VLOG(0) << "load " << varname << " with value: " << value_
+          << " , meta: " << meta_
           << " using: " << std::to_string((end - begin) / 1e+6) << " seconds";
 
   return 0;
diff --git a/paddle/fluid/distributed/table/depends/dense.h b/paddle/fluid/distributed/table/depends/dense.h
index 8079003d1bf8f..8e507842bc330 100644
--- a/paddle/fluid/distributed/table/depends/dense.h
+++ b/paddle/fluid/distributed/table/depends/dense.h
@@ -183,5 +183,97 @@ class DAdam : public DenseOptimizer {
   float epsilon;
 };
 
+// adam optimizer for dense tensor
+class DAdamD2Sum : public DenseOptimizer {
+ public:
+  explicit DAdamD2Sum(const CommonAccessorParameter& accessor,
+                      std::vector<std::vector<float>>* values) {
+    lr_hardcode = 5e-6;
+    auto& names = accessor.params();
+    for (int x = 0; x < static_cast<int>(names.size()); ++x) {
+      if (names[x] == "LearningRate") {
+        learning_rate = (*values)[x].data();
+      }
+      if (names[x] == "Param") {
+        param = (*values)[x].data();
+      }
+      if (names[x] == "Moment") {
+        mom_velocity = (*values)[x].data();
+      }
+      if (names[x] == "G2Sum") {
+        ada_g2sum = (*values)[x].data();
+      }
+      if (names[x] == "D2Sum") {
+        ada_d2sum = (*values)[x].data();
+      }
+      if (names[x] == "MomentDecayRate") {
+        mom_decay_rate = (*values)[x].data();
+      }
+      if (names[x] == "AdaDecayRate") {
+        ada_decay_rate = (*values)[x].data();
+      }
+      if (names[x] == "AdaEpsilon") {
+        ada_epsilon = (*values)[x].data();
+      }
+    }
+  }
+
+  void update(const float* update_values, size_t num, int begin,
+              int end) override {
+    auto update_numel = end - begin;
+    std::vector<float> grad, grad2, scale;
+    grad.resize(update_numel);
+    grad2.resize(update_numel);
+    scale.resize(update_numel);
+
+    auto blas = GetBlas<float>();
+    // copy grad
+    blas.VCOPY(update_numel, update_values + begin, grad.data());
+    blas.VCOPY(update_numel, update_values + begin, grad2.data());
+
+    // d2sum
+    blas.SCAL(update_numel, ada_decay_rate[0], ada_d2sum + begin);
+    ADD<float>(update_numel, ada_d2sum + begin, 1, ada_d2sum + begin);
+
+    // g2sum
+    blas.SCAL(update_numel, ada_decay_rate[0], ada_g2sum + begin);
+    blas.VSQUARE(update_numel, grad2.data(), grad2.data());
+    blas.VADD(update_numel, ada_g2sum + begin, grad2.data(), ada_g2sum + begin);
+
+    // mom
+    blas.SCAL(update_numel, mom_decay_rate[0], mom_velocity + begin);
+    blas.SCAL(update_numel, 1 - mom_decay_rate[0], grad.data());
+    blas.VADD(update_numel, mom_velocity + begin, grad.data(),
+              mom_velocity + begin);
+
+    // scale
+    float* scale_ = scale.data();
+    blas.VDIV(update_numel, ada_g2sum + begin, ada_d2sum + begin, scale_);
+    ADD<float>(update_numel, scale_, ada_epsilon[0], scale_);
+    DIV<float>(update_numel, 1 + ada_epsilon[0], scale_, scale_);
+    SQRT<float>(update_numel, scale_, scale_);
+
+    blas.SCAL(update_numel, learning_rate[0], scale_);
+
+    // TODO(zhaocaibei123): check if there exists elementwise_multiply in blas
+    // TODO(zhaocaibei123): blas.VMUL
+    ELE_MUL<float>(update_numel, scale_, mom_velocity + begin, scale_);
+
+    blas.VSUB(update_numel, param + begin, scale_, param + begin);
+  }
+
+  float* learning_rate;
+  float lr_hardcode;
+
+  float* param;
+  float* mom_velocity;
+  float* ada_g2sum;
+  float* ada_d2sum;
+
+  float* mom_decay_rate;
+  float* ada_decay_rate;
+  float* ada_epsilon;
+};
+
 }  // namespace distributed
 }  // namespace paddle
diff --git a/paddle/fluid/distributed/table/memory_sparse_table.cc b/paddle/fluid/distributed/table/memory_sparse_table.cc
new file mode 100644
index 0000000000000..da5c51dfd560a
--- /dev/null
+++ b/paddle/fluid/distributed/table/memory_sparse_table.cc
@@ -0,0 +1,615 @@
+// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <sstream>
+
+#include "paddle/fluid/distributed/table/memory_sparse_table.h"
+#include "paddle/fluid/framework/io/fs.h"
+
+#include "boost/lexical_cast.hpp"
+#include "glog/logging.h"
+#include "paddle/fluid/platform/enforce.h"
+
+namespace paddle {
+namespace distributed {
+
+// TODO(zhaocaibei123): configure
+bool FLAGS_pslib_create_value_when_push = false;
+int FLAGS_pslib_table_save_max_retry = 3;
+bool FLAGS_pslib_enable_create_feasign_randomly = false;
+
+int32_t MemorySparseTable::initialize() {
+  shards_task_pool_.resize(task_pool_size_);
+  for (int i = 0; i < shards_task_pool_.size(); ++i) {
+    shards_task_pool_[i].reset(new ::ThreadPool(1));
+  }
+  initialize_value();
+  VLOG(0) << "initalize MemorySparseTable succ";
+  return 0;
+}
+
+int32_t MemorySparseTable::initialize_value() {
+  sparse_table_shard_num_ = static_cast<int>(_config.shard_num());
+  avg_local_shard_num_ =
+      SparseTable::sparse_local_shard_num(sparse_table_shard_num_, _shard_num);
+  real_local_shard_num_ = avg_local_shard_num_;
+  if (real_local_shard_num_ * (_shard_idx + 1) > sparse_table_shard_num_) {
+    real_local_shard_num_ =
+        sparse_table_shard_num_ - real_local_shard_num_ * _shard_idx;
+    real_local_shard_num_ =
+        real_local_shard_num_ < 0 ? 0 : real_local_shard_num_;
+  }
+  VLOG(1) << "memory sparse table avg_local_shard_num_: "
+          << avg_local_shard_num_
+          << " real_local_shard_num_: " << real_local_shard_num_;
+
+  shard_values_.reserve(real_local_shard_num_);
+
+  for (int x = 0; x < real_local_shard_num_; ++x) {
+    auto shard = std::make_shared<SparseTableShard>();
+    shard_values_.emplace_back(shard);
+  }
+  return 0;
+}
+
+int32_t MemorySparseTable::load(const std::string& path,
+                                const std::string& param) {
+  std::string table_path = table_dir(path);
+  auto file_list = _afs_client.list(table_path);
+
+  std::sort(file_list.begin(), file_list.end());
+  for (auto file : file_list) {
+    VLOG(1) << "MemorySparseTable::load() file list: " << file;
+  }
+
+  int load_param = atoi(param.c_str());
+  auto expect_shard_num = sparse_table_shard_num_;
+  if (file_list.size() != expect_shard_num) {
+    LOG(WARNING) << "MemorySparseTable file_size:" << file_list.size()
+                 << " not equal to expect_shard_num:" << expect_shard_num;
+    return -1;
+  }
+  if (file_list.size() == 0) {
+    LOG(WARNING) << "MemorySparseTable load file is empty, path:" << path;
+    return -1;
+  }
+
+  size_t file_start_idx = _shard_idx * avg_local_shard_num_;
+
+  size_t feature_value_size = _value_accesor->size() / sizeof(float);
+  // TODO(zhaocaibei123): multi-thread
+  // int thread_num = shard_values_.size() < 15 ? shard_values_.size() : 15;
+  // omp_set_num_threads(thread_num);
+  // #pragma omp parallel for schedule(dynamic)
+  for (size_t i = 0; i < real_local_shard_num_; ++i) {
+    FsChannelConfig channel_config;
+    channel_config.path = file_list[file_start_idx + i];
+    VLOG(1) << "MemorySparseTable::load begin load " << channel_config.path
+            << " into local shard " << i;
+    channel_config.converter = _value_accesor->converter(load_param).converter;
+    channel_config.deconverter =
+        _value_accesor->converter(load_param).deconverter;
+
+    bool is_read_failed = false;
+    int retry_num = 0;
+    int err_no = 0;
+    do {
+      is_read_failed = false;
+      err_no = 0;
+      std::string line_data;
+      auto read_channel = _afs_client.open_r(channel_config, 0, &err_no);
+      char* end = NULL;
+      auto& shard = shard_values_[i];
+      try {
+        while (read_channel->read_line(line_data) == 0 &&
+               line_data.size() > 1) {
+          uint64_t key = std::strtoul(line_data.data(), &end, 10);
+          auto* value = shard->Init(key);
+          value->resize(feature_value_size);
+          int parse_size =
+              _value_accesor->parse_from_string(++end, value->data());
+          value->resize(parse_size);
+
+          // for debug
+          for (int ii = 0; ii < parse_size; ++ii) {
+            VLOG(2) << "MemorySparseTable::load key: " << key << " value " << ii
+                    << ": " << value->data()[ii] << " local_shard: " << i;
+          }
+        }
+        read_channel->close();
+        if (err_no == -1) {
+          ++retry_num;
+          is_read_failed = true;
+          LOG(ERROR)
+              << "MemorySparseTable load failed after read, retry it! path:"
+              << channel_config.path << " , retry_num=" << retry_num;
+        }
+      } catch (...) {
+        ++retry_num;
+        is_read_failed = true;
+        LOG(ERROR) << "MemorySparseTable load failed, retry it! path:"
+                   << channel_config.path << " , retry_num=" << retry_num;
+      }
+      if (retry_num > paddle::distributed::FLAGS_pslib_table_save_max_retry) {
+        LOG(ERROR) << "MemorySparseTable load failed reach max limit!";
+        exit(-1);
+      }
+    } while (is_read_failed);
+  }
+  LOG(INFO) << "MemorySparseTable load success, path from "
+            << file_list[file_start_idx] << " to "
+            << file_list[file_start_idx + real_local_shard_num_ - 1];
+  return 0;
+}
+
+int32_t MemorySparseTable::load_local_fs(const std::string& path,
+                                         const std::string& param) {
+  std::string table_path = table_dir(path);
+  auto file_list = paddle::framework::localfs_list(table_path);
+
+  int load_param = atoi(param.c_str());
+  auto expect_shard_num = sparse_table_shard_num_;
+  if (file_list.size() != expect_shard_num) {
+    LOG(WARNING) << "MemorySparseTable file_size:" << file_list.size()
+                 << " not equal to expect_shard_num:" << expect_shard_num;
+    return -1;
+  }
+  if (file_list.size() == 0) {
+    LOG(WARNING) << "MemorySparseTable load file is empty, path:" << path;
+    return -1;
+  }
+
+  size_t file_start_idx = _shard_idx * avg_local_shard_num_;
+
+  size_t feature_value_size = _value_accesor->size() / sizeof(float);
+
+  // int thread_num = shard_values_.size() < 15 ? shard_values_.size() : 15;
+  // omp_set_num_threads(thread_num);
+  // #pragma omp parallel for schedule(dynamic)
+  for (size_t i = 0; i < real_local_shard_num_; ++i) {
+    bool is_read_failed = false;
+    int retry_num = 0;
+    int err_no = 0;
+    do {
+      is_read_failed = false;
+      err_no = 0;
+      std::string line_data;
+      std::ifstream file(file_list[file_start_idx + i]);
+      char* end = NULL;
+      auto& shard = shard_values_[i];
+      try {
+        while (std::getline(file, line_data) && line_data.size() > 1) {
+          uint64_t key = std::strtoul(line_data.data(), &end, 10);
+          auto* value = shard->Init(key);
+          value->resize(feature_value_size);
+          int parse_size =
+              _value_accesor->parse_from_string(++end, value->data());
+          value->resize(parse_size);
+          // value->shrink_to_fit();
+        }
+        file.close();
+        if (err_no == -1) {
+          ++retry_num;
+          is_read_failed = true;
+          LOG(ERROR)
+              << "MemorySparseTable load failed after read, retry it! path:"
+              << file_list[file_start_idx + i] << " , retry_num=" << retry_num;
+        }
+      } catch (...) {
+        ++retry_num;
+        is_read_failed = true;
+        LOG(ERROR) << "MemorySparseTable load failed, retry it! path:"
+                   << file_list[file_start_idx + i]
+                   << " , retry_num=" << retry_num;
+      }
+      if (retry_num > paddle::distributed::FLAGS_pslib_table_save_max_retry) {
+        LOG(ERROR) << "MemorySparseTable load failed reach max limit!";
+        exit(-1);
+      }
+    } while (is_read_failed);
+  }
+  LOG(INFO) << "MemorySparseTable load success, path from "
+            << file_list[file_start_idx] << " to "
+            << file_list[file_start_idx + real_local_shard_num_ - 1];
+  return 0;
+}
+
+int32_t MemorySparseTable::save(const std::string& dirname,
+                                const std::string& param) {
+  VLOG(0) << "MemorySparseTable::save dirname: " << dirname;
+  int save_param =
+      atoi(param.c_str());  // checkpoint:0  xbox delta:1  xbox base:2
+  std::string table_path = table_dir(dirname);
+  _afs_client.remove(paddle::string::format_string(
+      "%s/part-%03d-*", table_path.c_str(), _shard_idx));
+  // int thread_num = shard_values_.size() < 20 ? shard_values_.size() : 20;
+  std::atomic<uint32_t> feasign_size_all{0};
+
+  size_t file_start_idx = avg_local_shard_num_ * _shard_idx;
+
+  // TODO(zhaocaibei123): openmp
+  // omp_set_num_threads(thread_num);
+  // #pragma omp parallel for schedule(dynamic)
+  for (size_t i = 0; i < real_local_shard_num_; ++i) {
+    FsChannelConfig channel_config;
+    if (_config.compress_in_save() && (save_param == 0 || save_param == 3)) {
+      channel_config.path = paddle::string::format_string(
+          "%s/part-%03d-%05d.gz", table_path.c_str(), _shard_idx,
+          file_start_idx + i);
+    } else {
+      channel_config.path =
+          paddle::string::format_string("%s/part-%03d-%05d", table_path.c_str(),
+                                        _shard_idx, file_start_idx + i);
+    }
+    channel_config.converter = _value_accesor->converter(save_param).converter;
+    channel_config.deconverter =
+        _value_accesor->converter(save_param).deconverter;
+    bool is_write_failed = false;
+    int feasign_size = 0;
+    int retry_num = 0;
+    int err_no = 0;
+    auto& shard = shard_values_[i];
+    do {
+      err_no = 0;
+      feasign_size = 0;
+      is_write_failed = false;
+      auto write_channel =
+          _afs_client.open_w(channel_config, 1024 * 1024 * 40, &err_no);
+      for (auto& table : shard->values_) {
+        for (auto& value : table) {
+          if (_value_accesor->save(value.second->data(), save_param)) {
+            std::string format_value = _value_accesor->parse_to_string(
+                value.second->data(), value.second->size());
+            if (0 !=
+                write_channel->write_line(paddle::string::format_string(
+                    "%lu %s", value.first, format_value.c_str()))) {
+              ++retry_num;
+              is_write_failed = true;
+              LOG(ERROR)
+                  << "MemorySparseTable save prefix failed, retry it! path:"
+                  << channel_config.path << " , retry_num=" << retry_num;
+              break;
+            }
+            ++feasign_size;
+          }
+        }
+      }
+      write_channel->close();
+      if (err_no == -1) {
+        ++retry_num;
+        is_write_failed = true;
+        LOG(ERROR)
+            << "MemorySparseTable save prefix failed after write, retry it! "
+            << "path:" << channel_config.path << " , retry_num=" << retry_num;
+      }
+      if (is_write_failed) {
+        _afs_client.remove(channel_config.path);
+      }
+      if (retry_num > paddle::distributed::FLAGS_pslib_table_save_max_retry) {
+        LOG(ERROR) << "MemorySparseTable save prefix failed reach max limit!";
+        exit(-1);
+      }
+    } while (is_write_failed);
+    feasign_size_all += feasign_size;
+    for (auto& table : shard->values_) {
+      for (auto& value : table) {
+        _value_accesor->update_stat_after_save(value.second->data(),
+                                               save_param);
+      }
+    }
+    LOG(INFO) << "MemorySparseTable save prefix success, path: "
+              << channel_config.path;
+  }
+  // int32 may overflow need to change return value
+  return 0;
+}
+
+int32_t MemorySparseTable::save_local_fs(const std::string& dirname,
+                                         const std::string& param,
+                                         const std::string& prefix) {
+  int save_param =
+      atoi(param.c_str());  // checkpoint:0  xbox delta:1  xbox base:2
+  std::string table_path = table_dir(dirname);
+  int feasign_cnt = 0;
+  size_t file_start_idx = avg_local_shard_num_ * _shard_idx;
+  for (size_t i = 0; i < real_local_shard_num_; ++i) {
+    feasign_cnt = 0;
+    auto& shard = shard_values_[i];
+    std::string file_name = paddle::string::format_string(
+        "%s/part-%s-%03d-%05d", table_path.c_str(), prefix.c_str(), _shard_idx,
+        file_start_idx + i);
+    std::ofstream os;
+    os.open(file_name);
+    for (auto& table : shard->values_) {
+      for (auto& value : table) {
+        if (_value_accesor->save(value.second->data(), save_param)) {
+          std::string format_value = _value_accesor->parse_to_string(
+              value.second->data(), value.second->size());
+          std::string out_line = paddle::string::format_string(
+              "%lu %s\n", value.first, format_value.c_str());
+          // VLOG(2) << out_line.c_str();
+          os.write(out_line.c_str(), sizeof(char) * out_line.size());
+          ++feasign_cnt;
+        }
+      }
+    }
+    os.close();
+    LOG(INFO) << "MemorySparseTable save prefix success, path:" << file_name
+              << "feasign_cnt: " << feasign_cnt;
+  }
+  return 0;
+}
+
+std::pair<int64_t, int64_t> MemorySparseTable::print_table_stat() {
+  int64_t feasign_size = 0;
+  int64_t mf_size = 0;
+
+  for (auto& shard : shard_values_) {
+    for (auto& table : shard->values_) {
+      feasign_size += table.size();
+    }
+  }
+
+  return {feasign_size, mf_size};
+}
+
+int32_t MemorySparseTable::pull_sparse(float* pull_values,
+                                       const PullSparseValue& pull_value) {
+  std::vector<std::future<int>> tasks(real_local_shard_num_);
+
+  const size_t value_size = _value_accesor->size() / sizeof(float);
+  size_t mf_value_size = _value_accesor->mf_size() / sizeof(float);
+  size_t select_value_size = _value_accesor->select_size() / sizeof(float);
+  // std::atomic<uint32_t> missed_keys{0};
+
+  std::vector<std::vector<std::pair<uint64_t, int>>> task_keys(
+      real_local_shard_num_);
+  size_t num = pull_value.numel_;
+  for (size_t i = 0; i < num; ++i) {
+    int shard_id = (pull_value.feasigns_[i] % sparse_table_shard_num_) %
+                   avg_local_shard_num_;
+    task_keys[shard_id].push_back({pull_value.feasigns_[i], i});
+  }
+  for (int shard_id = 0; shard_id < real_local_shard_num_; ++shard_id) {
+    tasks[shard_id] =
+        shards_task_pool_[shard_id % shards_task_pool_.size()]->enqueue(
+            [this, shard_id, &task_keys, value_size, pull_values, mf_value_size,
+             select_value_size]() -> int {
+              auto& local_shard = shard_values_[shard_id];
+              float data_buffer[value_size];  // NOLINT
+              float* data_buffer_ptr = data_buffer;
+
+              auto& keys = task_keys[shard_id];
+              for (size_t i = 0; i < keys.size(); i++) {
+                uint64_t key = keys[i].first;
+                auto itr = local_shard->Find(key);
+                size_t data_size = value_size - mf_value_size;
+                if (itr == local_shard->end()) {
+                  // ++missed_keys;
+                  if (FLAGS_pslib_create_value_when_push) {
+                    memset(data_buffer, 0, sizeof(float) * data_size);
+                  } else {
+                    auto* feature_value = local_shard->Init(key);
+                    feature_value->resize(data_size);
+                    float* data_ptr = feature_value->data();
+                    _value_accesor->create(&data_buffer_ptr, 1);
+                    memcpy(data_ptr, data_buffer_ptr,
+                           data_size * sizeof(float));
+                  }
+                } else {
+                  data_size = itr->second->size();
+                  memcpy(data_buffer_ptr, itr->second->data(),
+                         data_size * sizeof(float));
+                }
+                for (int mf_idx = data_size; mf_idx < value_size; ++mf_idx) {
+                  data_buffer[mf_idx] = 0.0;
+                }
+                auto offset = keys[i].second;
+                float* select_data = pull_values + select_value_size * offset;
+                _value_accesor->select(&select_data,
+                                       (const float**)&data_buffer_ptr, 1);
+              }
+
+              return 0;
+            });
+  }
+
+  for (size_t shard_id = 0; shard_id < tasks.size(); ++shard_id) {
+    tasks[shard_id].wait();
+  }
+
+  return 0;
+}
+
+int32_t MemorySparseTable::pull_sparse_ptr(char** pull_values,
+                                           const uint64_t* keys, size_t num) {
+  return 0;
+}
+
+int32_t MemorySparseTable::push_sparse(const uint64_t* keys,
+                                       const float* values, size_t num) {
+  std::vector<std::future<int>> tasks(real_local_shard_num_);
+  std::vector<std::vector<std::pair<uint64_t, int>>> task_keys(
+      real_local_shard_num_);
+  for (size_t i = 0; i < num; ++i) {
+    int shard_id = (keys[i] % sparse_table_shard_num_) % avg_local_shard_num_;
+    task_keys[shard_id].push_back({keys[i], i});
+  }
+
+  const size_t value_col = _value_accesor->size() / sizeof(float);
+  size_t mf_value_col = _value_accesor->mf_size() / sizeof(float);
+  size_t update_value_col = _value_accesor->update_size() / sizeof(float);
+
+  for (size_t shard_id = 0; shard_id < real_local_shard_num_; ++shard_id) {
+    tasks[shard_id] = shards_task_pool_[shard_id % task_pool_size_]->enqueue(
+        [this, shard_id, value_col, mf_value_col, update_value_col, values,
+         &task_keys]() -> int {
+          auto& keys = task_keys[shard_id];
+          auto& local_shard = shard_values_[shard_id];
+          float data_buffer[value_col];  // NOLINT
+          float* data_buffer_ptr = data_buffer;
+
+          for (int i = 0; i < keys.size(); ++i) {
+            uint64_t key = keys[i].first;
+            uint64_t push_data_idx = keys[i].second;
+            const float* update_data =
+                values + push_data_idx * update_value_col;
+            auto itr = local_shard->Find(key);
+            if (itr == local_shard->end()) {
+              VLOG(0) << "sparse table push_sparse: " << key << "not found!";
+              if (FLAGS_pslib_enable_create_feasign_randomly &&
+                  !_value_accesor->create_value(1, update_data)) {
+                continue;
+              }
+              auto value_size = value_col - mf_value_col;
+              auto* feature_value = local_shard->Init(key);
+              feature_value->resize(value_size);
+              _value_accesor->create(&data_buffer_ptr, 1);
+              memcpy(feature_value->data(), data_buffer_ptr,
+                     value_size * sizeof(float));
+              itr = local_shard->Find(key);
+            } else {
+              VLOG(2) << "sparse table debug push_sparse: " << key << " found!";
+            }
+
+            auto* feature_value = itr->second;
+            float* value_data = feature_value->data();
+            size_t value_size = feature_value->size();
+
+            if (value_size == value_col) {  // 已拓展到最大size, 则就地update
+              _value_accesor->update(&value_data, &update_data, 1);
+            } else {
+              // 拷入buffer区进行update，然后再回填，不需要的mf则回填时抛弃了
+              memcpy(data_buffer_ptr, value_data, value_size * sizeof(float));
+              _value_accesor->update(&data_buffer_ptr, &update_data, 1);
+
+              if (_value_accesor->need_extend_mf(data_buffer)) {
+                feature_value->resize(value_col);
+                value_data = feature_value->data();
+                _value_accesor->create(&value_data, 1);
+              }
+              memcpy(value_data, data_buffer_ptr, value_size * sizeof(float));
+            }
+          }
+          return 0;
+        });
+  }
+
+  for (size_t shard_id = 0; shard_id < tasks.size(); ++shard_id) {
+    tasks[shard_id].wait();
+  }
+  return 0;
+}
+
+int32_t MemorySparseTable::push_sparse(const uint64_t* keys,
+                                       const float** values, size_t num) {
+  _push_sparse(keys, values, num);
+  return 0;
+}
+
+int32_t MemorySparseTable::_push_sparse(const uint64_t* keys,
+                                        const float** values, size_t num) {
+  std::vector<std::future<int>> tasks(real_local_shard_num_);
+  std::vector<std::vector<std::pair<uint64_t, int>>> task_keys(
+      real_local_shard_num_);
+  for (size_t i = 0; i < num; ++i) {
+    int shard_id = (keys[i] % sparse_table_shard_num_) % avg_local_shard_num_;
+    task_keys[shard_id].push_back({keys[i], i});
+  }
+
+  size_t value_col = _value_accesor->size() / sizeof(float);
+  size_t mf_value_col = _value_accesor->mf_size() / sizeof(float);
+  size_t update_value_col = _value_accesor->update_size() / sizeof(float);
+
+  for (int shard_id = 0; shard_id < real_local_shard_num_; ++shard_id) {
+    tasks[shard_id] = shards_task_pool_[shard_id % task_pool_size_]->enqueue(
+        [this, shard_id, value_col, mf_value_col, update_value_col, values,
+         &task_keys]() -> int {
+          auto& keys = task_keys[shard_id];
+          auto& local_shard = shard_values_[shard_id];
+          float data_buffer[value_col];  // NOLINT
+          float* data_buffer_ptr = data_buffer;
+
+          for (int i = 0; i < keys.size(); ++i) {
+            uint64_t key = keys[i].first;
+            uint64_t push_data_idx = keys[i].second;
+            const float* update_data = values[push_data_idx];
+            auto itr = local_shard->Find(key);
+            if (itr == local_shard->end()) {
+              if (FLAGS_pslib_enable_create_feasign_randomly &&
+                  !_value_accesor->create_value(1, update_data)) {
+                continue;
+              }
+              auto value_size = value_col - mf_value_col;
+              auto* feature_value = local_shard->Init(key);
+              feature_value->resize(value_size);
+              _value_accesor->create(&data_buffer_ptr, 1);
+              memcpy(feature_value->data(), data_buffer_ptr,
+                     value_size * sizeof(float));
+              itr = local_shard->Find(key);
+            }
+            auto* feature_value = itr->second;
+            float* value_data = feature_value->data();
+            size_t value_size = feature_value->size();
+            if (value_size == value_col) {  // 已拓展到最大size, 则就地update
+              _value_accesor->update(&value_data, &update_data, 1);
+            } else {
+              // 拷入buffer区进行update，然后再回填，不需要的mf则回填时抛弃了
+              memcpy(data_buffer_ptr, value_data, value_size * sizeof(float));
+              _value_accesor->update(&data_buffer_ptr, &update_data, 1);
+              if (_value_accesor->need_extend_mf(data_buffer)) {
+                feature_value->resize(value_col);
+                value_data = feature_value->data();
+                _value_accesor->create(&value_data, 1);
+              }
+              memcpy(value_data, data_buffer_ptr, value_size * sizeof(float));
+            }
+          }
+          return 0;
+        });
+  }
+
+  for (size_t shard_id = 0; shard_id < tasks.size(); ++shard_id) {
+    tasks[shard_id].wait();
+  }
+  return 0;
+}
+
+int32_t MemorySparseTable::flush() { return 0; }
+
+int32_t MemorySparseTable::shrink(const std::string& param) {
+  VLOG(0) << "MemorySparseTable::shrink";
+  // TODO(zhaocaibei123): implement with multi-thread
+  for (int shard_id = 0; shard_id < real_local_shard_num_; ++shard_id) {
+    // shrink
+    auto& shard = shard_values_[shard_id];
+    for (auto& table : shard->values_) {
+      for (auto iter = table.begin(); iter != table.end();) {
+        if (_value_accesor->shrink(iter->second->data())) {
+          butil::return_object(iter->second);
+          iter = table.erase(iter);
+          VLOG(1) << "shrink erase key: " << iter->first;
+        } else {
+          ++iter;
+        }
+      }
+    }
+  }
+  return 0;
+}
+
+void MemorySparseTable::clear() { VLOG(0) << "clear coming soon"; }
+
+}  // namespace distributed
+}  // namespace paddle
diff --git a/paddle/fluid/distributed/table/memory_sparse_table.h b/paddle/fluid/distributed/table/memory_sparse_table.h
new file mode 100644
index 0000000000000..409757ebec22a
--- /dev/null
+++ b/paddle/fluid/distributed/table/memory_sparse_table.h
@@ -0,0 +1,92 @@
+// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <ThreadPool.h>
+#include <assert.h>
+#include <pthread.h>
+#include <memory>
+#include <mutex>  // NOLINT
+#include <string>
+#include <unordered_map>
+#include <utility>
+#include <vector>
+#include "Eigen/Dense"
+#include "paddle/fluid/distributed/table/accessor.h"
+#include "paddle/fluid/distributed/table/common_table.h"
+#include "paddle/fluid/distributed/table/depends/feature_value.h"
+#include "paddle/fluid/string/string_helper.h"
+
+#define PSERVER_SAVE_SUFFIX ".shard"
+
+namespace paddle {
+namespace distributed {
+
+class MemorySparseTable : public SparseTable {
+ public:
+  MemorySparseTable() {}
+  virtual ~MemorySparseTable() {}
+
+  // unused method begin
+  virtual int32_t pull_dense(float* pull_values, size_t num) { return 0; }
+  virtual int32_t push_dense_param(const float* values, size_t num) {
+    return 0;
+  }
+  virtual int32_t push_dense(const float* values, size_t num) { return 0; }
+  // unused method end
+
+  virtual int32_t initialize();
+  virtual int32_t initialize_shard() { return 0; }
+  virtual int32_t initialize_value();
+
+  virtual int32_t load(const std::string& path, const std::string& param);
+
+  virtual int32_t save(const std::string& path, const std::string& param);
+
+  int32_t load_local_fs(const std::string& path, const std::string& param);
+  int32_t save_local_fs(const std::string& path, const std::string& param,
+                        const std::string& prefix);
+
+  virtual std::pair<int64_t, int64_t> print_table_stat();
+  virtual int32_t pull_sparse(float* values, const PullSparseValue& pull_value);
+
+  virtual int32_t pull_sparse_ptr(char** pull_values, const uint64_t* keys,
+                                  size_t num);
+
+  virtual int32_t push_sparse(const uint64_t* keys, const float* values,
+                              size_t num);
+
+  virtual int32_t push_sparse(const uint64_t* keys, const float** values,
+                              size_t num);
+
+  virtual int32_t flush();
+  virtual int32_t shrink(const std::string& param);
+  virtual void clear();
+
+ protected:
+  virtual int32_t _push_sparse(const uint64_t* keys, const float** values,
+                               size_t num);
+
+ protected:
+  const int task_pool_size_ = 24;
+  size_t avg_local_shard_num_;
+  size_t real_local_shard_num_;
+  size_t sparse_table_shard_num_;
+  std::vector<std::shared_ptr<::ThreadPool>> shards_task_pool_;
+  std::vector<std::shared_ptr<SparseTableShard>> shard_values_;
+};
+
+}  // namespace distributed
+}  // namespace paddle
diff --git a/paddle/fluid/distributed/table/table.cc b/paddle/fluid/distributed/table/table.cc
index 0f8753c074634..ac026184b8864 100644
--- a/paddle/fluid/distributed/table/table.cc
+++ b/paddle/fluid/distributed/table/table.cc
@@ -24,6 +24,8 @@
 #ifdef PADDLE_WITH_HETERPS
 #include "paddle/fluid/distributed/table/ssd_sparse_table.h"
 #endif
+#include "paddle/fluid/distributed/table/ctr_accessor.h"
+#include "paddle/fluid/distributed/table/memory_sparse_table.h"
 #include "paddle/fluid/distributed/table/tensor_accessor.h"
 #include "paddle/fluid/distributed/table/tensor_table.h"
 
@@ -40,7 +42,13 @@ REGISTER_PSCORE_CLASS(Table, BarrierTable);
 REGISTER_PSCORE_CLASS(Table, TensorTable);
 REGISTER_PSCORE_CLASS(Table, DenseTensorTable);
 REGISTER_PSCORE_CLASS(Table, GlobalStepTable);
+REGISTER_PSCORE_CLASS(Table, MemorySparseTable);
 REGISTER_PSCORE_CLASS(ValueAccessor, CommMergeAccessor);
+REGISTER_PSCORE_CLASS(ValueAccessor, CtrCommonAccessor);
+REGISTER_PSCORE_CLASS(SparseValueSGDRule, StdAdaGradSGDRule);
+REGISTER_PSCORE_CLASS(SparseValueSGDRule, SparseAdamSGDRule);
+REGISTER_PSCORE_CLASS(SparseValueSGDRule, SparseNaiveSGDRule);
+REGISTER_PSCORE_CLASS(SparseValueSGDRule, SparseAdaGradSGDRule);
 
 int32_t TableManager::initialize() {
   static bool initialized = false;
@@ -58,6 +66,11 @@ int32_t Table::initialize(const TableParameter &config,
     LOG(WARNING) << "Table accessor initialize failed";
     return -1;
   }
+
+  if (_afs_client.initialize(fs_config) != 0) {
+    LOG(WARNING) << "Table fs_client initialize failed";
+    // return -1;
+  }
   return initialize();
 }
 
@@ -67,6 +80,9 @@ int32_t Table::initialize_accessor() {
                << _config.table_id();
     return -1;
   }
+
+  LOG(INFO) << "accessor initializing: table_id: " << _config.table_id()
+            << ", accessor_name: " << _config.accessor().accessor_class();
   auto *accessor = CREATE_PSCORE_CLASS(
       ValueAccessor,
       _config.accessor().accessor_class()) if (accessor == NULL) {
diff --git a/paddle/fluid/distributed/table/table.h b/paddle/fluid/distributed/table/table.h
index 55fc92c9b5785..f6568b4336fbb 100644
--- a/paddle/fluid/distributed/table/table.h
+++ b/paddle/fluid/distributed/table/table.h
@@ -20,6 +20,7 @@
 #include <memory>
 #include <string>
 #include <utility>
+#include "paddle/fluid/distributed/common/afs_warpper.h"
 #include "paddle/fluid/distributed/table/accessor.h"
 #include "paddle/fluid/distributed/table/depends/sparse_utils.h"
 #include "paddle/fluid/distributed/table/graph/graph_node.h"
@@ -103,10 +104,10 @@ class Table {
   virtual int32_t flush() = 0;
   virtual int32_t shrink(const std::string &param) = 0;
 
-  //指定加载路径
+  // 指定加载路径
   virtual int32_t load(const std::string &path,
                        const std::string &converter) = 0;
-  //指定保存路径
+  // 指定保存路径
   virtual int32_t save(const std::string &path,
                        const std::string &converter) = 0;
 
@@ -137,6 +138,7 @@ class Table {
   TableParameter _config;
   float *_global_lr = nullptr;
   std::shared_ptr<ValueAccessor> _value_accesor;
+  AfsClient _afs_client;
 };
 REGISTER_PSCORE_REGISTERER(Table);
 
diff --git a/paddle/fluid/distributed/test/CMakeLists.txt b/paddle/fluid/distributed/test/CMakeLists.txt
index f8cd9af4774ec..597a08973b957 100644
--- a/paddle/fluid/distributed/test/CMakeLists.txt
+++ b/paddle/fluid/distributed/test/CMakeLists.txt
@@ -29,3 +29,6 @@ cc_test(sparse_sgd_rule_test SRCS sparse_sgd_rule_test.cc DEPS ${COMMON_DEPS} bo
 
 set_source_files_properties(ctr_accessor_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
 cc_test(ctr_accessor_test SRCS ctr_accessor_test.cc DEPS ${COMMON_DEPS} boost table)
+
+set_source_files_properties(memory_sparse_table_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
+cc_test(memory_sparse_table_test SRCS memory_sparse_table_test.cc DEPS ${COMMON_DEPS} boost table)
diff --git a/paddle/fluid/distributed/test/graph_node_test.cc b/paddle/fluid/distributed/test/graph_node_test.cc
index 613770220f9d7..c061fe0bb909d 100644
--- a/paddle/fluid/distributed/test/graph_node_test.cc
+++ b/paddle/fluid/distributed/test/graph_node_test.cc
@@ -111,7 +111,7 @@ void testFeatureNodeSerializeFloat64() {
 void testSingleSampleNeighboor(
     std::shared_ptr<paddle::distributed::GraphBrpcClient>& worker_ptr_) {
   std::vector<std::vector<std::pair<uint64_t, float>>> vs;
-  auto pull_status = worker_ptr_->batch_sample_neighboors(
+  auto pull_status = worker_ptr_->batch_sample_neighbors(
       0, std::vector<uint64_t>(1, 37), 4, vs);
   pull_status.wait();
 
@@ -127,7 +127,7 @@ void testSingleSampleNeighboor(
   s.clear();
   s1.clear();
   vs.clear();
-  pull_status = worker_ptr_->batch_sample_neighboors(
+  pull_status = worker_ptr_->batch_sample_neighbors(
       0, std::vector<uint64_t>(1, 96), 4, vs);
   pull_status.wait();
   s1 = {111, 48, 247};
@@ -139,7 +139,7 @@ void testSingleSampleNeighboor(
     ASSERT_EQ(true, s1.find(g) != s1.end());
   }
   vs.clear();
-  pull_status = worker_ptr_->batch_sample_neighboors(0, {96, 37}, 4, vs, 0);
+  pull_status = worker_ptr_->batch_sample_neighbors(0, {96, 37}, 4, vs, 0);
   pull_status.wait();
   ASSERT_EQ(vs.size(), 2);
 }
@@ -199,7 +199,7 @@ void testBatchSampleNeighboor(
     std::shared_ptr<paddle::distributed::GraphBrpcClient>& worker_ptr_) {
   std::vector<std::vector<std::pair<uint64_t, float>>> vs;
   std::vector<std::uint64_t> v = {37, 96};
-  auto pull_status = worker_ptr_->batch_sample_neighboors(0, v, 4, vs);
+  auto pull_status = worker_ptr_->batch_sample_neighbors(0, v, 4, vs);
   pull_status.wait();
   std::unordered_set<uint64_t> s;
   std::unordered_set<uint64_t> s1 = {112, 45, 145};
@@ -222,6 +222,7 @@ void testBatchSampleNeighboor(
   }
 }
 
+void testCache();
 void testGraphToBuffer();
 // std::string nodes[] = {std::string("37\taa\t45;0.34\t145;0.31\t112;0.21"),
 //                        std::string("96\tfeature\t48;1.4\t247;0.31\t111;1.21"),
@@ -400,6 +401,7 @@ void RunClient(
 }
 
 void RunBrpcPushSparse() {
+  testCache();
   setenv("http_proxy", "", 1);
   setenv("https_proxy", "", 1);
   prepare_file(edge_file_name, 1);
@@ -433,10 +435,33 @@ void RunBrpcPushSparse() {
   sleep(5);
   testSingleSampleNeighboor(worker_ptr_);
   testBatchSampleNeighboor(worker_ptr_);
-  pull_status = worker_ptr_->batch_sample_neighboors(
+  pull_status = worker_ptr_->batch_sample_neighbors(
       0, std::vector<uint64_t>(1, 10240001024), 4, vs);
   pull_status.wait();
   ASSERT_EQ(0, vs[0].size());
+  paddle::distributed::GraphTable* g =
+      (paddle::distributed::GraphTable*)pserver_ptr_->table(0);
+  size_t ttl = 6;
+  g->make_neighbor_sample_cache(4, ttl);
+  int round = 5;
+  while (round--) {
+    vs.clear();
+    pull_status = worker_ptr_->batch_sample_neighbors(
+        0, std::vector<uint64_t>(1, 37), 1, vs);
+    pull_status.wait();
+
+    for (int i = 0; i < ttl; i++) {
+      std::vector<std::vector<std::pair<uint64_t, float>>> vs1;
+      pull_status = worker_ptr_->batch_sample_neighbors(
+          0, std::vector<uint64_t>(1, 37), 1, vs1);
+      pull_status.wait();
+      ASSERT_EQ(vs[0].size(), vs1[0].size());
+
+      for (int j = 0; j < vs[0].size(); j++) {
+        ASSERT_EQ(vs[0][j].first, vs1[0][j].first);
+      }
+    }
+  }
 
   std::vector<distributed::FeatureNode> nodes;
   pull_status = worker_ptr_->pull_graph_list(0, 0, 0, 1, 1, nodes);
@@ -534,13 +559,13 @@ void RunBrpcPushSparse() {
     ASSERT_EQ(count_item_nodes.size(), 12);
   }
 
-  vs = client1.batch_sample_neighboors(std::string("user2item"),
-                                       std::vector<uint64_t>(1, 96), 4);
+  vs = client1.batch_sample_neighbors(std::string("user2item"),
+                                      std::vector<uint64_t>(1, 96), 4);
   ASSERT_EQ(vs[0].size(), 3);
   std::vector<uint64_t> node_ids;
   node_ids.push_back(96);
   node_ids.push_back(37);
-  vs = client1.batch_sample_neighboors(std::string("user2item"), node_ids, 4);
+  vs = client1.batch_sample_neighbors(std::string("user2item"), node_ids, 4);
 
   ASSERT_EQ(vs.size(), 2);
   std::vector<uint64_t> nodes_ids = client2.random_sample_nodes("user", 0, 6);
@@ -607,6 +632,56 @@ void RunBrpcPushSparse() {
   client1.stop_server();
 }
 
+void testCache() {
+  ::paddle::distributed::ScaledLRU<::paddle::distributed::SampleKey,
+                                   ::paddle::distributed::SampleResult>
+      st(1, 2, 4);
+  char* str = new char[7];
+  strcpy(str, "54321");
+  ::paddle::distributed::SampleResult* result =
+      new ::paddle::distributed::SampleResult(5, str);
+  ::paddle::distributed::SampleKey skey = {6, 1};
+  std::vector<std::pair<::paddle::distributed::SampleKey,
+                        paddle::distributed::SampleResult>>
+      r;
+  st.query(0, &skey, 1, r);
+  ASSERT_EQ((int)r.size(), 0);
+
+  st.insert(0, &skey, result, 1);
+  for (int i = 0; i < st.get_ttl(); i++) {
+    st.query(0, &skey, 1, r);
+    ASSERT_EQ((int)r.size(), 1);
+    char* p = (char*)r[0].second.buffer.get();
+    for (int j = 0; j < r[0].second.actual_size; j++) ASSERT_EQ(p[j], str[j]);
+    r.clear();
+  }
+  st.query(0, &skey, 1, r);
+  ASSERT_EQ((int)r.size(), 0);
+  str = new char[10];
+  strcpy(str, "54321678");
+  result = new ::paddle::distributed::SampleResult(strlen(str), str);
+  st.insert(0, &skey, result, 1);
+  for (int i = 0; i < st.get_ttl() / 2; i++) {
+    st.query(0, &skey, 1, r);
+    ASSERT_EQ((int)r.size(), 1);
+    char* p = (char*)r[0].second.buffer.get();
+    for (int j = 0; j < r[0].second.actual_size; j++) ASSERT_EQ(p[j], str[j]);
+    r.clear();
+  }
+  str = new char[18];
+  strcpy(str, "343332d4321");
+  result = new ::paddle::distributed::SampleResult(strlen(str), str);
+  st.insert(0, &skey, result, 1);
+  for (int i = 0; i < st.get_ttl(); i++) {
+    st.query(0, &skey, 1, r);
+    ASSERT_EQ((int)r.size(), 1);
+    char* p = (char*)r[0].second.buffer.get();
+    for (int j = 0; j < r[0].second.actual_size; j++) ASSERT_EQ(p[j], str[j]);
+    r.clear();
+  }
+  st.query(0, &skey, 1, r);
+  ASSERT_EQ((int)r.size(), 0);
+}
 void testGraphToBuffer() {
   ::paddle::distributed::GraphNode s, s1;
   s.set_feature_size(1);
diff --git a/paddle/fluid/distributed/test/memory_sparse_table_test.cc b/paddle/fluid/distributed/test/memory_sparse_table_test.cc
new file mode 100644
index 0000000000000..30a1107d64e3c
--- /dev/null
+++ b/paddle/fluid/distributed/test/memory_sparse_table_test.cc
@@ -0,0 +1,140 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <ThreadPool.h>
+
+#include <unistd.h>
+#include <string>
+#include <thread>  // NOLINT
+
+#include "google/protobuf/text_format.h"
+#include "gtest/gtest.h"
+#include "paddle/fluid/distributed/ps.pb.h"
+#include "paddle/fluid/distributed/table/memory_sparse_table.h"
+#include "paddle/fluid/distributed/table/table.h"
+
+namespace paddle {
+namespace distributed {
+
+TEST(MemorySparseTable, SGD) {
+  int emb_dim = 8;
+  int trainers = 2;
+
+  TableParameter table_config;
+  table_config.set_table_class("MemorySparseTable");
+  table_config.set_shard_num(10);
+  FsClientParameter fs_config;
+  Table *table = new MemorySparseTable();
+  table->set_shard(0, 1);
+
+  TableAccessorParameter *accessor_config = table_config.mutable_accessor();
+  accessor_config->set_accessor_class("CtrCommonAccessor");
+  accessor_config->set_fea_dim(11);
+  accessor_config->set_embedx_dim(8);
+  accessor_config->set_embedx_threshold(5);
+  accessor_config->mutable_ctr_accessor_param()->set_nonclk_coeff(0.2);
+  accessor_config->mutable_ctr_accessor_param()->set_click_coeff(1);
+  accessor_config->mutable_ctr_accessor_param()->set_base_threshold(0.5);
+  accessor_config->mutable_ctr_accessor_param()->set_delta_threshold(0.2);
+  accessor_config->mutable_ctr_accessor_param()->set_delta_keep_days(16);
+  accessor_config->mutable_ctr_accessor_param()->set_show_click_decay_rate(
+      0.99);
+
+  accessor_config->mutable_embed_sgd_param()->set_name("SparseNaiveSGDRule");
+  auto *naive_param =
+      accessor_config->mutable_embed_sgd_param()->mutable_naive();
+  naive_param->set_learning_rate(0.1);
+  naive_param->set_initial_range(0.3);
+  naive_param->add_weight_bounds(-10.0);
+  naive_param->add_weight_bounds(10.0);
+
+  accessor_config->mutable_embedx_sgd_param()->set_name("SparseNaiveSGDRule");
+  naive_param = accessor_config->mutable_embedx_sgd_param()->mutable_naive();
+  naive_param->set_learning_rate(0.1);
+  naive_param->set_initial_range(0.3);
+  naive_param->add_weight_bounds(-10.0);
+  naive_param->add_weight_bounds(10.0);
+
+  auto ret = table->initialize(table_config, fs_config);
+  ASSERT_EQ(ret, 0);
+
+  // pull parameters for create and check
+  std::vector<uint64_t> init_keys = {0, 1, 2, 3, 4};
+  std::vector<uint32_t> init_fres = {1, 1, 1, 1, 1};
+
+  std::vector<float> init_values;
+  init_values.resize(init_keys.size() * (emb_dim + 1));
+  auto value = PullSparseValue(init_keys, init_fres, emb_dim);
+  table->pull_sparse(init_values.data(), value);
+
+  // for check
+  std::vector<float> total_gradients;
+  total_gradients.resize(init_keys.size() * (4 + emb_dim));
+  memset(total_gradients.data(), 0, sizeof(float) * total_gradients.size());
+
+  // push gradient
+  std::vector<std::vector<uint64_t>> trainer_keys;
+  std::vector<std::vector<float>> trainer_gradient_values;
+  trainer_keys.resize(trainers);
+  trainer_gradient_values.resize(trainers);
+  float start = 0.0;
+  for (int i = 0; i < trainers; i++) {
+    start = 0.0;
+    trainer_keys[i] = init_keys;
+    for (size_t j = 0; j < trainer_keys[i].size(); j++) {
+      auto id = trainer_keys[i][j];
+      for (int k = 0; k < emb_dim + 4; k++) {
+        trainer_gradient_values[i].push_back(start);
+        total_gradients[id * (emb_dim + 4) + k] += start;
+        start += 0.1;
+      }
+    }
+  }
+
+  std::shared_ptr<::ThreadPool> pool_ =
+      std::make_shared<::ThreadPool>(trainers);
+  std::vector<std::future<void>> task_status;
+  for (int i = 0; i < trainers; i++) {
+    auto &push_keys = trainer_keys[i];
+    auto &push_values = trainer_gradient_values[i];
+    auto task = [table, &push_keys, &push_values] {
+      table->push_sparse(push_keys.data(), push_values.data(),
+                         push_keys.size());
+    };
+    task_status.push_back(pool_->enqueue(std::move(task)));
+  }
+  for (auto &status : task_status) {
+    status.wait();
+  }
+
+  std::vector<float> pull_values;
+  pull_values.resize(init_keys.size() * (emb_dim + 1));
+  table->pull_sparse(pull_values.data(), value);
+
+  for (size_t i = 0; i < init_keys.size(); ++i) {
+    for (size_t j = 0; j < emb_dim + 1; ++j) {
+      auto update_val = init_values[i * (emb_dim + 1) + j] -
+                        0.1 * total_gradients[3 + i * (emb_dim + 4) + j];
+      VLOG(3) << total_gradients[i * (emb_dim + 4) + j + 3] << ":"
+              << init_values[i * (emb_dim + 1) + j];
+      VLOG(3) << update_val << ": " << pull_values[i * (emb_dim + 1) + j];
+    }
+  }
+
+  MemorySparseTable *ctr_table = dynamic_cast<MemorySparseTable *>(table);
+  ctr_table->save_local_fs("./work/table.save", "0", "test");
+}
+
+}  // namespace distributed
+}  // namespace paddle
diff --git a/paddle/fluid/extension/include/ext_place.h b/paddle/fluid/extension/include/ext_place.h
index c9ed40a382417..91d4f41c21351 100644
--- a/paddle/fluid/extension/include/ext_place.h
+++ b/paddle/fluid/extension/include/ext_place.h
@@ -17,6 +17,6 @@ limitations under the License. */
 namespace paddle {
 
 // TODO(yangjiabin): Add other place support in next PR
-enum class PlaceType { kUNK = -1, kCPU, kGPU, kHIP };
+enum class PlaceType { kUNK = -1, kCPU, kGPU };
 
 }  // namespace paddle
diff --git a/paddle/fluid/extension/include/ext_tensor.h b/paddle/fluid/extension/include/ext_tensor.h
index 7d13f56b02b82..970be905cc256 100644
--- a/paddle/fluid/extension/include/ext_tensor.h
+++ b/paddle/fluid/extension/include/ext_tensor.h
@@ -16,8 +16,15 @@ limitations under the License. */
 
 #include <memory>
 #include <vector>
+
 #ifdef PADDLE_WITH_CUDA
 #include <cuda_runtime.h>
+using gpuStream_t = cudaStream_t;
+#endif
+
+#ifdef PADDLE_WITH_HIP
+#include <hip/hip_runtime.h>
+using gpuStream_t = hipStream_t;
 #endif
 
 #include "ext_dll_decl.h"  // NOLINT
@@ -126,11 +133,9 @@ class PD_DLL_DECL Tensor {
   /// \brief Check Tensor is initialized
   bool is_initialized() const;
 
-#if defined(PADDLE_WITH_CUDA)
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   /// \bref Get current stream of Tensor
-  cudaStream_t stream() const;
-#elif defined(PADDLE_WITH_HIP)
-  hipStream_t stream() const;
+  gpuStream_t stream() const;
 #endif
 
  private:
diff --git a/paddle/fluid/extension/src/ext_tensor.cc b/paddle/fluid/extension/src/ext_tensor.cc
index a0a9872c4c29c..b5cd9e0b5c0e1 100644
--- a/paddle/fluid/extension/src/ext_tensor.cc
+++ b/paddle/fluid/extension/src/ext_tensor.cc
@@ -69,9 +69,9 @@ struct CastDataType {
 };
 
 template <typename T>
-void DeviceCopy(T *src, T *dst, PlaceType src_plc, PlaceType dst_plc,
-                int64_t ele_size) {
-#if defined(PADDLE_WITH_CUDA)
+void GpuCopy(T *src, T *dst, PlaceType src_plc, PlaceType dst_plc,
+             int64_t ele_size) {
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
   int device_num = paddle::platform::GetCurrentDeviceId();
   platform::CUDAPlace gpu_place(device_num);
@@ -90,29 +90,11 @@ void DeviceCopy(T *src, T *dst, PlaceType src_plc, PlaceType dst_plc,
     PADDLE_THROW(platform::errors::Unavailable(
         "Only GPU related Copy can reach this func."));
   }
-#elif defined(PADDLE_WITH_HIP)
-  platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
-  int device_num = paddle::platform::GetCurrentDeviceId();
-  platform::CUDAPlace gpu_place(device_num);
-  auto *dev_ctx =
-      static_cast<const platform::CUDADeviceContext *>(pool.Get(gpu_place));
-  if ((src_plc == PlaceType::kHIP) && (dst_plc == PlaceType::kCPU)) {
-    memory::Copy(platform::CPUPlace(), static_cast<void *>(dst), gpu_place, src,
-                 ele_size, dev_ctx->stream());
-  } else if ((src_plc == PlaceType::kHIP) && (dst_plc == PlaceType::kHIP)) {
-    memory::Copy(gpu_place, static_cast<void *>(dst), gpu_place, src, ele_size,
-                 dev_ctx->stream());
-  } else if ((src_plc == PlaceType::kCPU) && (dst_plc == PlaceType::kHIP)) {
-    memory::Copy(gpu_place, static_cast<void *>(dst), platform::CPUPlace(), src,
-                 ele_size, dev_ctx->stream());
-  } else {
-    PADDLE_THROW(platform::errors::Unavailable(
-        "Only GPU related Copy can reach this func."));
-  }
+#ifdef PADDLE_WITH_HIP
+  hipStreamSynchronize(dev_ctx->stream());
 #else
-  PADDLE_THROW(platform::errors::Unavailable(
-      "This function can only be used if compiled with"
-      "either -DWITH_ROCM=ON or -DWITH_GPU=ON"));
+  cudaStreamSynchronize(dev_ctx->stream());
+#endif
 #endif
 }
 
@@ -175,16 +157,11 @@ T *Tensor::mutable_data() {
     case static_cast<int>(PlaceType::kCPU): {
       return tensor->mutable_data<T>(platform::CPUPlace());
     }
-#if defined(PADDLE_WITH_CUDA)
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
     case static_cast<int>(PlaceType::kGPU): {
       int device_num = platform::GetCurrentDeviceId();
       return tensor->mutable_data<T>(platform::CUDAPlace(device_num));
     }
-#elif defined(PADDLE_WITH_HIP)
-    case static_cast<int>(PlaceType::kHIP): {
-      int device_num = platform::GetCurrentDeviceId();
-      return tensor->mutable_data<T>(platform::CUDAPlace(device_num));
-    }
 #endif
     default:
       PADDLE_THROW(platform::errors::Unavailable(
@@ -245,23 +222,17 @@ Tensor Tensor::copy_to(const PlaceType &target_place) const {
   target.reshape(shape());
   auto *p_target_data = target.template mutable_data<T>();
 
-  bool supported_gpu_transform = false;
-#if defined(PADDLE_WITH_CUDA)
-  supported_gpu_transform =
-      (src_place == PlaceType::kGPU && target_place == PlaceType::kCPU) ||
-      (src_place == PlaceType::kCPU && target_place == PlaceType::kGPU) ||
-      (src_place == PlaceType::kGPU && target_place == PlaceType::kGPU);
-#elif defined(PADDLE_WITH_HIP)
-  supported_gpu_transform =
-      (src_place == PlaceType::kHIP && target_place == PlaceType::kCPU) ||
-      (src_place == PlaceType::kCPU && target_place == PlaceType::kHIP) ||
-      (src_place == PlaceType::kHIP && target_place == PlaceType::kHIP);
-#endif
-
   if ((src_place == PlaceType::kCPU) && (target_place == PlaceType::kCPU)) {
     std::memcpy(static_cast<void *>(p_target_data), p_src_data, ele_size);
-  } else if (supported_gpu_transform) {
-    DeviceCopy<T>(p_src_data, p_target_data, src_place, target_place, ele_size);
+  } else if ((src_place == PlaceType::kGPU) &&
+             (target_place == PlaceType::kCPU)) {
+    GpuCopy<T>(p_src_data, p_target_data, src_place, target_place, ele_size);
+  } else if ((src_place == PlaceType::kCPU) &&
+             (target_place == PlaceType::kGPU)) {
+    GpuCopy<T>(p_src_data, p_target_data, src_place, target_place, ele_size);
+  } else if ((src_place == PlaceType::kGPU) &&
+             (target_place == PlaceType::kGPU)) {
+    GpuCopy<T>(p_src_data, p_target_data, src_place, target_place, ele_size);
   } else {
     PADDLE_THROW(platform::errors::Unavailable(
         "Not supported place transform of place: %d to place: %d",
@@ -363,18 +334,15 @@ const PlaceType &Tensor::place() const {
   GET_CASTED_TENSOR;
   if (platform::is_cpu_place(tensor->place())) {
     place_ = PlaceType::kCPU;
-#if defined(PADDLE_WITH_CUDA)
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   } else if (platform::is_gpu_place(tensor->place())) {
     place_ = PlaceType::kGPU;
-#elif defined(PADDLE_WITH_HIP)
-  } else if (platform::is_gpu_place(tensor->place())) {
-    place_ = PlaceType::kHIP;
 #endif
   } else {
     PADDLE_THROW(platform::errors::Unimplemented(
         "Current Tensor hold unsupported Place Type, Please Init it"
         "using Tensor::mutable_data<T>(PaddlePlace) with T among:"
-        "Place::kCPU or Place::kGPU or Place::kHIP"));
+        "Place::kCPU or Place::kGPU"));
   }
   return place_;
 }
@@ -456,21 +424,16 @@ bool Tensor::is_initialized() const {
   }
 }
 
-#define DEFINE_STREAM(_stream_t_)                               \
-  _stream_t_ Tensor::stream() const {                           \
-    if (!stream_.IsStreamSet()) {                               \
-      PADDLE_THROW(platform::errors::PreconditionNotMet(        \
-          "Stream is not Set, only input tensor will have "     \
-          "stream which is set by framework "));                \
-    } else {                                                    \
-      return reinterpret_cast<_stream_t_>(stream_.GetStream()); \
-    }                                                           \
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+gpuStream_t Tensor::stream() const {
+  if (!stream_.IsStreamSet()) {
+    PADDLE_THROW(platform::errors::PreconditionNotMet(
+        "Stream is not Set, only input tensor will have "
+        "stream which is set by framework "));
+  } else {
+    return reinterpret_cast<gpuStream_t>(stream_.GetStream());
   }
-
-#if defined(PADDLE_WITH_CUDA)
-DEFINE_STREAM(cudaStream_t)
-#elif defined(PADDLE_WITH_HIP)
-DEFINE_STREAM(hipStream_t)
+}
 #endif
 
 namespace framework {
diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt
index edb43b8d38c27..7a3a287af33c0 100644
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@@ -117,7 +117,7 @@ cc_test(reader_test SRCS reader_test.cc DEPS reader)
 cc_library(threadpool SRCS threadpool.cc DEPS enforce)
 cc_test(threadpool_test SRCS threadpool_test.cc DEPS threadpool)
 
-cc_library(var_type_traits SRCS var_type_traits.cc DEPS lod_tensor selected_rows framework_proto)
+cc_library(var_type_traits SRCS var_type_traits.cc DEPS lod_tensor selected_rows framework_proto scope)
 if (WITH_GPU)
   target_link_libraries(var_type_traits dynload_cuda)
 endif()
@@ -125,7 +125,7 @@ cc_test(var_type_traits_test SRCS var_type_traits_test.cc DEPS var_type_traits)
 
 set(BRPC_DEPS "")
 if(WITH_PSLIB OR WITH_PSCORE)
-    set(BRPC_DEPS brpc)
+    set(BRPC_DEPS brpc ssl crypto)
     if(WITH_PSLIB_BRPC)
         set(BRPC_DEPS pslib_brpc)
     endif()
@@ -197,10 +197,12 @@ cc_library(unused_var_check SRCS unused_var_check.cc DEPS glog no_need_buffer_va
 
 IF(WITH_XPU)
 cc_library(operator SRCS operator.cc DEPS xpu_op_list op_info device_context tensor scope glog trainer_desc_proto data_feed_proto
-    shape_inference data_transform lod_tensor profiler transfer_scope_cache op_kernel_type op_call_stack unused_var_check nan_inf_utils)
+    shape_inference data_transform lod_tensor profiler transfer_scope_cache op_kernel_type op_call_stack unused_var_check nan_inf_utils
+    pten pten_utils kernel_factory)
 ELSE()
 cc_library(operator SRCS operator.cc DEPS op_info device_context tensor scope glog trainer_desc_proto data_feed_proto
-    shape_inference data_transform lod_tensor profiler transfer_scope_cache op_kernel_type op_call_stack unused_var_check nan_inf_utils)
+    shape_inference data_transform lod_tensor profiler transfer_scope_cache op_kernel_type op_call_stack unused_var_check nan_inf_utils
+    pten pten_utils kernel_factory)
 ENDIF()
 
 cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry device_context)
@@ -245,6 +247,9 @@ if(WITH_PYTHON)
       COMMAND cp distributed_strategy_*.py ${PADDLE_BINARY_DIR}/python/paddle/distributed/fleet/proto
       COMMENT "Copy generated python proto into directory paddle/fluid/proto."
       WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
+    add_custom_target(fleet_executor_proto_init ALL DEPENDS fleet_executor_desc_py_proto 
+      COMMAND cp ${PADDLE_BINARY_DIR}/paddle/fluid/distributed/fleet_executor/fleet_executor_*.py ${PADDLE_BINARY_DIR}/python/paddle/distributed/fleet/proto
+      COMMENT "Copy generated python proto into directory paddle/distributed/fleet/proto.")
   else(NOT WIN32)
     string(REPLACE "/" "\\" proto_dstpath "${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/")
     string(REPLACE "/" "\\" fleet_proto_dstpath "${PADDLE_BINARY_DIR}/python/paddle/distributed/fleet/proto/")
@@ -284,7 +289,7 @@ if(WITH_DISTRIBUTE)
     fleet_wrapper heter_wrapper ps_gpu_wrapper box_wrapper lodtensor_printer
     lod_rank_table feed_fetch_method collective_helper ${GLOB_DISTRIBUTE_DEPS}
     graph_to_program_pass variable_helper data_feed_proto timer monitor
-    heter_service_proto ${BRPC_DEP})
+    heter_service_proto fleet_executor ${BRPC_DEP})
     set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
     if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 7.0)
         set(DISTRIBUTE_COMPILE_FLAGS
@@ -303,7 +308,7 @@ if(WITH_DISTRIBUTE)
             pull_dense_worker.cc section_worker.cc device_worker_factory.cc data_set.cc DEPS op_registry
             device_context scope framework_proto data_feed_proto heter_service_proto trainer_desc_proto glog
             lod_rank_table fs shell fleet_wrapper heter_wrapper box_wrapper lodtensor_printer feed_fetch_method
-            graph_to_program_pass variable_helper timer monitor heter_service_proto fleet)
+            graph_to_program_pass variable_helper timer monitor heter_service_proto fleet fleet_executor)
     set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
     set_source_files_properties(executor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
     set_source_files_properties(multi_trainer.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
@@ -317,7 +322,7 @@ if(WITH_DISTRIBUTE)
             pull_dense_worker.cc section_worker.cc device_worker_factory.cc data_set.cc DEPS op_registry
             device_context scope framework_proto data_feed_proto heter_service_proto trainer_desc_proto glog
             lod_rank_table fs shell fleet_wrapper heter_wrapper ps_gpu_wrapper box_wrapper lodtensor_printer feed_fetch_method
-            graph_to_program_pass variable_helper timer monitor)
+            graph_to_program_pass variable_helper timer monitor fleet_executor)
   endif()
 elseif(WITH_PSLIB)
   set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
@@ -337,7 +342,7 @@ elseif(WITH_PSLIB)
   pull_dense_worker.cc section_worker.cc device_worker_factory.cc data_set.cc DEPS op_registry
   device_context scope framework_proto data_feed_proto heter_service_proto trainer_desc_proto glog
   lod_rank_table fs shell fleet_wrapper heter_wrapper ps_gpu_wrapper box_wrapper lodtensor_printer feed_fetch_method
-  graph_to_program_pass variable_helper timer monitor ${BRPC_DEP})
+  graph_to_program_pass variable_helper timer monitor fleet_executor ${BRPC_DEP})
 else()
   cc_library(executor SRCS executor.cc multi_trainer.cc pipeline_trainer.cc dataset_factory.cc
   dist_multi_trainer.cc trainer_factory.cc trainer.cc data_feed_factory.cc
@@ -347,7 +352,7 @@ else()
   pull_dense_worker.cc section_worker.cc device_worker_factory.cc data_set.cc DEPS op_registry
   device_context scope framework_proto data_feed_proto heter_service_proto trainer_desc_proto glog
   lod_rank_table fs shell fleet_wrapper heter_wrapper ps_gpu_wrapper box_wrapper lodtensor_printer feed_fetch_method
-  graph_to_program_pass variable_helper timer monitor)
+  graph_to_program_pass variable_helper timer monitor fleet_executor)
 endif()
 
 target_link_libraries(executor while_op_helper executor_gc_helper recurrent_op_helper conditional_block_op_helper)
@@ -394,6 +399,8 @@ cc_library(save_load_util SRCS save_load_util.cc DEPS tensor scope layer)
 cc_test(save_load_util_test SRCS save_load_util_test.cc DEPS save_load_util tensor scope layer)
 cc_library(generator SRCS generator.cc DEPS enforce place)
 
+cc_library(pten_utils SRCS pten_utils.cc DEPS lod_tensor selected_rows place pten var_type_traits pten_api_utils op_info)
+
 # Get the current working branch
 execute_process(
   COMMAND git rev-parse --abbrev-ref HEAD
@@ -456,3 +463,4 @@ if(WITH_TESTING AND TEST selected_rows_test)
 endif()
 
 cc_test(scope_guard_test SRCS scope_guard_test.cc)
+cc_test(pten_utils_test SRCS pten_utils_test.cc DEPS pten_utils)
diff --git a/paddle/fluid/framework/custom_operator.cc b/paddle/fluid/framework/custom_operator.cc
index 19e661587716b..bb8258dcd9228 100644
--- a/paddle/fluid/framework/custom_operator.cc
+++ b/paddle/fluid/framework/custom_operator.cc
@@ -503,7 +503,7 @@ void RegisterOperatorKernel(const std::string& name,
   // but call api in gpu device, it will cause error.
   RegisterOperatorKernelWithPlace(name, kernel_func, proto::VarType::RAW,
                                   PlaceType::kCPU, inputs, outputs, attrs);
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   RegisterOperatorKernelWithPlace(name, kernel_func, proto::VarType::RAW,
                                   PlaceType::kGPU, inputs, outputs, attrs);
 #endif
diff --git a/paddle/fluid/framework/custom_tensor_test.cc b/paddle/fluid/framework/custom_tensor_test.cc
index 5d181bfb53bc9..342be27c896ae 100644
--- a/paddle/fluid/framework/custom_tensor_test.cc
+++ b/paddle/fluid/framework/custom_tensor_test.cc
@@ -38,7 +38,7 @@ void TestCopyTensor() {
   for (int64_t i = 0; i < t1.size(); i++) {
     CHECK_EQ(t1_cpu_cp.template data<T>()[i], T(5));
   }
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   VLOG(2) << "Do GPU copy test";
   auto t1_gpu_cp = t1_cpu_cp.template copy_to<T>(paddle::PlaceType::kGPU);
   CHECK((paddle::PlaceType::kGPU == t1_gpu_cp.place()));
@@ -50,33 +50,16 @@ void TestCopyTensor() {
   for (int64_t i = 0; i < t1.size(); i++) {
     CHECK_EQ(t1_gpu_cp_cp_cpu.template data<T>()[i], T(5));
   }
-#elif defined(PADDLE_WITH_HIP)
-  VLOG(2) << "Do HIP copy test";
-  auto t1_gpu_cp = t1_cpu_cp.template copy_to<T>(paddle::PlaceType::kHIP);
-  CHECK((paddle::PlaceType::kHIP == t1_gpu_cp.place()));
-  auto t1_gpu_cp_cp = t1_gpu_cp.template copy_to<T>(paddle::PlaceType::kHIP);
-  CHECK((paddle::PlaceType::kHIP == t1_gpu_cp_cp.place()));
-  auto t1_gpu_cp_cp_cpu =
-      t1_gpu_cp_cp.template copy_to<T>(paddle::PlaceType::kCPU);
-  CHECK((paddle::PlaceType::kCPU == t1_gpu_cp_cp_cpu.place()));
-  for (int64_t i = 0; i < t1.size(); i++) {
-    CHECK_EQ(t1_gpu_cp_cp_cpu.template data<T>()[i], T(5));
-  }
 #endif
 }
 
 void TestAPIPlace() {
   std::vector<int64_t> tensor_shape = {5, 5};
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   auto t1 = paddle::Tensor(paddle::PlaceType::kGPU);
   t1.reshape(tensor_shape);
   t1.mutable_data<float>();
   CHECK((paddle::PlaceType::kGPU == t1.place()));
-#elif defined(PADDLE_WITH_HIP)
-  auto t1 = paddle::Tensor(paddle::PlaceType::kHIP);
-  t1.reshape(tensor_shape);
-  t1.mutable_data<float>();
-  CHECK((paddle::PlaceType::kHIP == t1.place()));
 #endif
   auto t2 = paddle::Tensor(paddle::PlaceType::kCPU);
   t2.reshape(tensor_shape);
@@ -97,7 +80,7 @@ void TestAPISlice() {
   std::vector<int64_t> tensor_shape_sub1 = {3, 5};
   std::vector<int64_t> tensor_shape_origin2 = {5, 5, 5};
   std::vector<int64_t> tensor_shape_sub2 = {1, 5, 5};
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   auto t1 = paddle::Tensor(paddle::PlaceType::kGPU, tensor_shape_origin1);
   t1.mutable_data<float>();
   CHECK(t1.slice(0, 5).shape() == tensor_shape_origin1);
@@ -144,7 +127,7 @@ void TestCast(paddle::DataType data_type) {
   t1.template mutable_data<T>();
   auto t2 = t1.cast(data_type);
   CHECK(t2.type() == data_type);
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   auto tg1 = paddle::Tensor(paddle::PlaceType::kGPU);
   tg1.reshape(tensor_shape);
   tg1.template mutable_data<T>();
diff --git a/paddle/fluid/framework/custom_tensor_utils.h b/paddle/fluid/framework/custom_tensor_utils.h
index 809a6b965aad9..d7bde04b84b16 100644
--- a/paddle/fluid/framework/custom_tensor_utils.h
+++ b/paddle/fluid/framework/custom_tensor_utils.h
@@ -18,11 +18,9 @@ limitations under the License. */
 
 #include "paddle/fluid/extension/include/ext_tensor.h"
 #include "paddle/fluid/framework/data_type.h"
+#include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/gpu_info.h"
 #include "paddle/fluid/platform/place.h"
-#ifdef PADDLE_WITH_CUDA
-#endif
-#include "paddle/fluid/platform/device_context.h"
 
 namespace paddle {
 namespace framework {
@@ -110,7 +108,7 @@ class CustomTensorUtils {
     if (pc == PlaceType::kCPU) {
       return platform::Place(platform::CPUPlace());
     } else if (pc == PlaceType::kGPU) {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
       return platform::Place(
           platform::CUDAPlace(platform::GetCurrentDeviceId()));
 #endif
@@ -127,7 +125,7 @@ class CustomTensorUtils {
     if (platform::is_cpu_place(pc)) {
       return PlaceType::kCPU;
     } else if (platform::is_gpu_place(pc)) {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
       return PlaceType::kGPU;
 #endif
     } else {
@@ -142,7 +140,7 @@ class CustomTensorUtils {
   static void SetTensorCurrentStream(paddle::Tensor* src,
                                      const platform::Place& pc) {
     if (platform::is_gpu_place(pc)) {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
       auto* dev_ctx = static_cast<platform::CUDADeviceContext*>(
           platform::DeviceContextPool::Instance().Get(pc));
       src->stream_.SetStream(reinterpret_cast<void*>(dev_ctx->stream()));
diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc
index 1bb1ae0ea6755..cee97820d6a03 100644
--- a/paddle/fluid/framework/details/build_strategy.cc
+++ b/paddle/fluid/framework/details/build_strategy.cc
@@ -52,6 +52,15 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
     ResolveOptionConfliction();
 
     AppendPrintGraphPass("graph_viz_pass", "_original_graph");
+
+#ifdef PADDLE_WITH_CINN
+    if (FLAGS_use_cinn) {
+      // Note: This pass is used to enable cinn.
+      AppendPass("build_cinn_pass");
+      AppendPrintGraphPass("graph_viz_pass", "_build_cinn_graph");
+    }
+#endif
+
     AppendPassWithCheck(strategy_.enable_sequential_execution_,
                         "sequential_execution_pass");
     AppendPassWithCheck(strategy_.sync_batch_norm_, "sync_batch_norm_pass");
@@ -74,13 +83,6 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
     // Note: This pass is used to check whether the multi_device_graph is right.
     AppendPass("multi_devices_check_pass");
 
-#ifdef PADDLE_WITH_CINN
-    if (FLAGS_use_cinn) {
-      // Note: This pass is used to enable cinn.
-      AppendPass("build_cinn_pass");
-    }
-#endif
-
     SetCollectiveContext();
   }
 
diff --git a/paddle/fluid/framework/details/nan_inf_utils.h b/paddle/fluid/framework/details/nan_inf_utils.h
index 5a592f22dc494..e4fd24f201d7f 100644
--- a/paddle/fluid/framework/details/nan_inf_utils.h
+++ b/paddle/fluid/framework/details/nan_inf_utils.h
@@ -27,7 +27,7 @@ namespace framework {
 namespace details {
 // assert false when meets NAN or inf
 void CheckVarHasNanOrInf(const std::string& op_type,
-                         const framework::Scope& scope,
+                         const framework::ScopeBase& scope,
                          const std::string& var_name,
                          const platform::Place& place);
 
@@ -37,7 +37,7 @@ void CheckVarHasNanOrInf(const std::string& op_type,
                          const platform::Place& place);
 
 void CheckOpHasNanOrInf(const framework::OperatorBase& op,
-                        const framework::Scope& scope,
+                        const framework::ScopeBase& scope,
                         const platform::Place& place);
 
 template <typename VarType>
@@ -55,7 +55,7 @@ void CheckOpHasNanOrInfInDygraph(const std::string& op_type,
 
 #ifdef PADDLE_WITH_ASCEND_CL
 void NPUAllocAndClearFloatStatus(const framework::OperatorBase& op,
-                                 const framework::Scope& scope,
+                                 const framework::ScopeBase& scope,
                                  const platform::Place& place);
 #endif
 
diff --git a/paddle/fluid/framework/details/nan_inf_utils_detail.cc b/paddle/fluid/framework/details/nan_inf_utils_detail.cc
index f22f008c19896..2c2f40c06ea34 100644
--- a/paddle/fluid/framework/details/nan_inf_utils_detail.cc
+++ b/paddle/fluid/framework/details/nan_inf_utils_detail.cc
@@ -407,7 +407,7 @@ void CheckVarHasNanOrInf(const std::string& op_type,
 }
 
 void CheckVarHasNanOrInf(const std::string& op_type,
-                         const framework::Scope& scope,
+                         const framework::ScopeBase& scope,
                          const std::string& var_name,
                          const platform::Place& place) {
   auto* var = scope.FindVar(var_name);
@@ -440,7 +440,7 @@ static framework::Tensor& npu_float_status() {
 }
 
 void NPUAllocAndClearFloatStatus(const framework::OperatorBase& op,
-                                 const framework::Scope& scope,
+                                 const framework::ScopeBase& scope,
                                  const platform::Place& place) {
   if (!platform::is_npu_place(place)) return;
 
@@ -505,7 +505,7 @@ void PrintNpuVarInfo(const std::string& op_type, const std::string& var_name,
 }
 
 void PrintNPUOpValueInfo(const framework::OperatorBase& op,
-                         const framework::Scope& scope,
+                         const framework::ScopeBase& scope,
                          const platform::Place& place) {
   LOG(WARNING) << "There are `nan` or `inf` in operator (" << op.Type()
                << "), here we print some tensor value info of this op.";
@@ -523,7 +523,7 @@ void PrintNPUOpValueInfo(const framework::OperatorBase& op,
 }
 
 static void NPUCheckOpHasNanOrInf(const framework::OperatorBase& op,
-                                  const framework::Scope& scope,
+                                  const framework::ScopeBase& scope,
                                   const platform::Place& place) {
   if (!platform::is_npu_place(place)) return;
 
@@ -551,14 +551,13 @@ static void NPUCheckOpHasNanOrInf(const framework::OperatorBase& op,
 
   if (sum >= 1.0) PrintNPUOpValueInfo(op, scope, place);
 
-  PADDLE_ENFORCE_LT(
-      sum, 1.0, platform::errors::PreconditionNotMet(
-                    "Operator %s contains Nan/Inf.", op.DebugStringEx(&scope)));
+  PADDLE_ENFORCE_LT(sum, 1.0, platform::errors::PreconditionNotMet(
+                                  "Operator %s contains Nan/Inf.", op.Type()));
 }
 #endif
 
 void CheckOpHasNanOrInf(const framework::OperatorBase& op,
-                        const framework::Scope& exec_scope,
+                        const framework::ScopeBase& exec_scope,
                         const platform::Place& place) {
   std::call_once(white_list_init_flag, InitWhiteListFormEnv);
 
diff --git a/paddle/fluid/framework/distributed_strategy.proto b/paddle/fluid/framework/distributed_strategy.proto
index 28eebeb4d9bdc..bd84471e63ef7 100644
--- a/paddle/fluid/framework/distributed_strategy.proto
+++ b/paddle/fluid/framework/distributed_strategy.proto
@@ -173,6 +173,68 @@ message TensorParallelConfig {
   optional int32 tensor_init_seed = 2 [ default = -1 ];
 }
 
+enum TableType {
+  PS_SPARSE_TABLE = 0;
+  PS_DENSE_TABLE = 1;
+}
+
+message TableParameter {
+  optional uint64 table_id = 1;
+  optional string table_class = 2;
+  optional uint64 shard_num = 3;
+  optional TableType type = 4;
+  optional TableAccessorParameter accessor = 5;
+}
+
+message TableAccessorParameter {
+  optional string accessor_class = 1;
+  optional SGDParameter embed_sgd_param = 2;
+  optional SGDParameter embedx_sgd_param = 3;
+  optional uint32 fea_dim = 4; // for sparse table, this means field size of one
+                               // value; for dense table, this means total value
+                               // num
+  optional uint32 embedx_dim = 5;       // embedx feature size
+  optional uint32 embedx_threshold = 6; // embedx feature create threshold
+  optional CtrAccessorParameter ctr_accessor_param = 7;
+}
+
+// TODO(guanqun): add NaiveSGD/Adam...
+message SGDParameter {
+  optional string name = 1;
+  optional SGDRuleParameter adagrad = 2;
+}
+
+message SGDRuleParameter {
+  optional double learning_rate = 1;
+  optional double initial_g2sum = 2;
+  optional double initial_range = 3 [ default = 0 ];
+  repeated float weight_bounds = 4;
+}
+
+message CtrAccessorParameter {
+  optional float nonclk_coeff = 1; // to calculate show_click_score
+  optional float click_coeff = 2;  // to calculate show_click_score
+  optional float base_threshold =
+      3; // show_click_score > base_threshold, this feature can be saved
+  optional float delta_threshold =
+      4; // delta_score > delta_threshold, this feature can be saved
+  optional float delta_keep_days =
+      5; // unseen_day < delta_keep_days, this feature can be saved
+  optional float show_click_decay_rate = 6; // show/click will update to
+                                            // show/click *
+                                            // show_click_decay_rate after a day
+  optional float delete_threshold = 7;      // threshold to shrink a feasign
+  optional float delete_after_unseen_days = 8;
+  optional int32 ssd_unseenday_threshold = 9;
+}
+
+message FsClientParameter {
+  optional string uri = 1;
+  optional string user = 2;
+  optional string passwd = 3;
+  optional string hadoop_bin = 4;
+}
+
 message DistributedStrategy {
   // bool options
   optional Mode mode = 1 [ default = COLLECTIVE ];
@@ -210,6 +272,7 @@ message DistributedStrategy {
   optional bool asp = 33 [ default = false ];
   optional bool fuse_grad_merge = 34 [ default = false ];
   optional bool semi_auto = 35 [ default = false ];
+  optional bool adam_d2sum = 36 [ default = true ];
 
   optional RecomputeConfig recompute_configs = 101;
   optional AMPConfig amp_configs = 102;
@@ -225,6 +288,9 @@ message DistributedStrategy {
   optional HybridConfig hybrid_configs = 112;
   optional TensorParallelConfig tensor_parallel_configs = 113;
   optional TrainerDescConfig trainer_desc_configs = 114;
+  optional TableParameter downpour_table_param = 115;
+  optional FsClientParameter fs_client_param = 116;
+
   optional BuildStrategy build_strategy = 201;
   optional ExecutionStrategy execution_strategy = 202;
   optional GradientScaleConfig gradient_scale_configs = 203;
diff --git a/paddle/fluid/framework/fleet/heter_ps/cudf/concurrent_unordered_map.cuh.h b/paddle/fluid/framework/fleet/heter_ps/cudf/concurrent_unordered_map.cuh.h
index 8b04d703c8898..266508eb4de6c 100644
--- a/paddle/fluid/framework/fleet/heter_ps/cudf/concurrent_unordered_map.cuh.h
+++ b/paddle/fluid/framework/fleet/heter_ps/cudf/concurrent_unordered_map.cuh.h
@@ -12,6 +12,10 @@
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
+ *
+ * This source code refers to https://github.com/rapidsai/cudf
+ * and is licensed under the license found in the LICENSE file
+ * in the root directory of this source tree.
  */
 
 #ifndef CONCURRENT_UNORDERED_MAP_CUH
diff --git a/paddle/fluid/framework/fleet/heter_ps/cudf/hash_functions.cuh b/paddle/fluid/framework/fleet/heter_ps/cudf/hash_functions.cuh
index 9264bd0a21c8b..aae2aaaaca0e2 100644
--- a/paddle/fluid/framework/fleet/heter_ps/cudf/hash_functions.cuh
+++ b/paddle/fluid/framework/fleet/heter_ps/cudf/hash_functions.cuh
@@ -12,6 +12,10 @@
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
+ *
+ * This source code refers to https://github.com/rapidsai/cudf
+ * and is licensed under the license found in the LICENSE file
+ * in the root directory of this source tree.
  */
 
 #ifndef HASH_FUNCTIONS_CUH
diff --git a/paddle/fluid/framework/fleet/heter_ps/cudf/managed.cuh b/paddle/fluid/framework/fleet/heter_ps/cudf/managed.cuh
index a0e34c66f0b2a..1b0c31554108f 100644
--- a/paddle/fluid/framework/fleet/heter_ps/cudf/managed.cuh
+++ b/paddle/fluid/framework/fleet/heter_ps/cudf/managed.cuh
@@ -12,6 +12,10 @@
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
+ *
+ * This source code refers to https://github.com/rapidsai/cudf
+ * and is licensed under the license found in the LICENSE file
+ * in the root directory of this source tree.
  */
 
 #ifndef MANAGED_CUH
diff --git a/paddle/fluid/framework/fleet/heter_ps/cudf/managed_allocator.cuh b/paddle/fluid/framework/fleet/heter_ps/cudf/managed_allocator.cuh
index 62c7d7aa74d9d..a052bc61af22a 100644
--- a/paddle/fluid/framework/fleet/heter_ps/cudf/managed_allocator.cuh
+++ b/paddle/fluid/framework/fleet/heter_ps/cudf/managed_allocator.cuh
@@ -12,6 +12,10 @@
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
+ *
+ * This source code refers to https://github.com/rapidsai/cudf
+ * and is licensed under the license found in the LICENSE file
+ * in the root directory of this source tree.
  */
 
 #ifndef MANAGED_ALLOCATOR_CUH 
diff --git a/paddle/fluid/framework/io/CMakeLists.txt b/paddle/fluid/framework/io/CMakeLists.txt
index 0ffaf70be316f..85b45f1a5bbc1 100644
--- a/paddle/fluid/framework/io/CMakeLists.txt
+++ b/paddle/fluid/framework/io/CMakeLists.txt
@@ -1,5 +1,5 @@
-cc_library(fs SRCS fs.cc DEPS string_helper glog boost enforce)
 cc_library(shell SRCS shell.cc DEPS string_helper glog timer enforce)
+cc_library(fs SRCS fs.cc DEPS string_helper glog boost enforce shell)
 
 cc_test(test_fs SRCS test_fs.cc DEPS fs shell)
 if (WITH_CRYPTO) 
diff --git a/paddle/fluid/framework/ir/embedding_eltwise_layernorm_fuse_pass.cc b/paddle/fluid/framework/ir/embedding_eltwise_layernorm_fuse_pass.cc
index 0f6421134c216..12800bd26dae5 100644
--- a/paddle/fluid/framework/ir/embedding_eltwise_layernorm_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/embedding_eltwise_layernorm_fuse_pass.cc
@@ -314,14 +314,17 @@ int EmbeddingEltwiseLayerNormFusePass::BuildFusion(
     new_op_desc.SetType("fused_embedding_eltwise_layernorm");
     new_op_desc.SetInput("Ids", ids);
     new_op_desc.SetInput("Embs", embs);
-
     new_op_desc.SetInput("WordId", {ids[0]});
     new_op_desc.SetInput("PosId", {ids[1]});
-    new_op_desc.SetInput("SentId", {ids[2]});
+    if (ids.size() > 2) {
+      new_op_desc.SetInput("SentId", {ids[2]});
+    }
 
     new_op_desc.SetInput("WordEmbedding", {embs[0]});
     new_op_desc.SetInput("PosEmbedding", {embs[1]});
-    new_op_desc.SetInput("SentEmbedding", {embs[2]});
+    if (embs.size() > 2) {
+      new_op_desc.SetInput("SentEmbedding", {embs[2]});
+    }
 
     new_op_desc.SetInput("Bias", {end_pattern_biases[k]->Name()});
     new_op_desc.SetInput("Scale", {end_pattern_scales[k]->Name()});
@@ -380,7 +383,6 @@ EmbeddingEltwiseLayerNormFusePass::EmbeddingEltwiseLayerNormFusePass() {
       .IsTensor()
       .End()
       .AddAttr("axis")
-      .IsIntIn({0, -1})
       .End();
 
   AddOpCompat(OpCompat("layer_norm"))
@@ -430,6 +432,6 @@ REGISTER_PASS(embedding_eltwise_layernorm_fuse_pass,
 REGISTER_PASS_CAPABILITY(embedding_eltwise_layernorm_fuse_pass)
     .AddCombination(
         paddle::framework::compatible::OpVersionComparatorCombination()
-            .EQ("lookup_table", 0)
+            .LE("lookup_table", 1)
             .LE("lookup_table_v2", 1)
-            .EQ("elementweise_add", 0));
+            .LE("elementweise_add", 1));
diff --git a/paddle/fluid/framework/ir/generate_pass.cc b/paddle/fluid/framework/ir/generate_pass.cc
index b261cbeb08e3b..3f9ad5b2c5203 100644
--- a/paddle/fluid/framework/ir/generate_pass.cc
+++ b/paddle/fluid/framework/ir/generate_pass.cc
@@ -19,21 +19,63 @@ namespace paddle {
 namespace framework {
 namespace ir {
 
-void InitGeneratePattern(const proto::PassDesc& pass_desc, PDPattern* pattern) {
-  const proto::BlockDesc& block = pass_desc.pattern().blocks(0);
-  for (const proto::VarDesc& var : block.vars()) {
-    PDNode* var_pdnode = pattern->NewNode(var.name())->AsInput();
-    var_pdnode->assert_is_var();
-    var_pdnode->assert_more([&](Node* x) {
-      if (VarDesc(var).GetShape() == x->Var()->GetShape()) {
-        return true;
+class operation_visitor : public boost::static_visitor<Attribute> {
+ public:
+  explicit operation_visitor(const proto::PassDesc::OperationType& type)
+      : type_(type) {}
+
+  template <typename T1, typename T2>
+  Attribute operator()(const T1& attr, const T2& operation) const {
+    PADDLE_THROW(platform::errors::Unimplemented("Unimplemented operand."));
+  }
+
+  template <typename T,
+            std::enable_if_t<std::is_integral<T>::value ||
+                             std::is_floating_point<T>::value>* = nullptr>
+  Attribute operator()(const T& attr, const T& operation) const {
+    switch (type_) {
+      case proto::PassDesc_OperationType_kSub: {
+        return attr - operation;
+      }
+
+      default:
+        PADDLE_THROW(
+            platform::errors::Unimplemented("Unimplemented operation type."));
+    }
+  }
+
+ private:
+  proto::PassDesc::OperationType type_;
+};
+
+Attribute GetVarAttrValue(const VarDesc* desc,
+                          const proto::PassDesc::Attr& attr) {
+  if ("shape" == attr.name()) {
+    std::vector<int64_t> shape = desc->GetShape();
+    if (attr.has_operation()) {
+      if (attr.operation() == proto::PassDesc_OperationType_kSize) {
+        return static_cast<int>(shape.size());
+      }
+    } else if (attr.has_element_index()) {
+      int element_index = attr.element_index();
+      if (attr.element_index() < 0) {
+        element_index += shape.size();
       }
-      return false;
-    });
+      if (element_index >= 0 &&
+          static_cast<size_t>(element_index) < shape.size()) {
+        return static_cast<int>(shape[element_index]);
+      }
+    } else {
+      return shape;
+    }
   }
+  return boost::blank();
+}
+
+void InitGeneratePattern(const proto::PassDesc& pass_desc, PDPattern* pattern) {
   // Traverse all operators to create subgraph.
-  for (int index = 0; index < block.ops_size(); ++index) {
-    const proto::OpDesc& op = block.ops(index);
+  for (int index = 0; index < pass_desc.pattern_size(); ++index) {
+    const proto::OpDesc& op = pass_desc.pattern(index);
     // Create a PDNode for current operator. Use the index as name to avoid
     // multiple operators with same type. Get a PDNode from pattern subgraph
     // through index in rewrite phase.
@@ -116,6 +158,23 @@ void InitGeneratePattern(const proto::PassDesc& pass_desc, PDPattern* pattern) {
       });
     }
   }
+  for (const auto& condition : pass_desc.var_attr_conditions()) {
+    if (condition.has_condition_value()) {
+      PDNode* pdnode = pattern->RetrieveNode(condition.attr().var_name());
+      pdnode->assert_more([&](Node* x) {
+        Attribute attr = GetVarAttrValue(x->Var(), condition.attr());
+        switch (condition.type()) {
+          case proto::PassDesc_ConditionType_kEQ: {
+            return attr == GetAttrValue(condition.condition_value());
+          }
+
+          default:
+            PADDLE_THROW(platform::errors::Unimplemented(
+                "Unimplemented condition type."));
+        }
+      });
+    }
+  }
 }
 
 // There are some duplicate patterns.
@@ -176,7 +235,33 @@ GraphPatternDetector::handle_t GetGenerateRewrite(
     if (IsDuplicatePattern(subgraph, graph)) {
       return;
     }
-    const proto::BlockDesc& block = pass_desc.replace().blocks(0);
+    for (const auto& condition : pass_desc.var_attr_conditions()) {
+      if (condition.has_condition_attr()) {
+        Node* node =
+            subgraph.at(pattern.RetrieveNode(condition.attr().var_name()));
+        Attribute node_attr = GetVarAttrValue(node->Var(), condition.attr());
+        Attribute condition_attr;
+        if (condition.condition_attr().role() ==
+            proto::PassDesc_RoleType_kVariable) {
+          Node* condition_node =
+              subgraph.at(pattern.RetrieveNode(condition.attr().var_name()));
+          condition_attr = GetVarAttrValue(condition_node->Var(),
+                                           condition.condition_attr());
+        } else {
+          PADDLE_THROW(
+              platform::errors::Unimplemented("Unimplemented for operation."));
+        }
+        bool check_failed = false;
+        if (condition.type() == proto::PassDesc_ConditionType_kEQ) {
+          check_failed = !(node_attr == condition_attr);
+        }
+        if (check_failed) {
+          VLOG(3) << "Check var [" << node->Name() << "] with attr ["
+                  << condition.attr().name() << "] failed, skip this pattern.";
+          return;
+        }
+      }
+    }
     // `var_node_maps` record the mapping of variable to the pattern subgraph.
     std::map<std::string, Node*> var_node_maps;
     for (const proto::PassDesc::VarMap& var_map : pass_desc.var_maps()) {
@@ -184,7 +269,8 @@ GraphPatternDetector::handle_t GetGenerateRewrite(
       var_node_maps.insert({var_map.replace_var(), node});
     }
     // Traverse all operators to create subgraph.
-    for (const proto::OpDesc& op : block.ops()) {
+    for (int index = 0; index < pass_desc.replace_size(); ++index) {
+      const proto::OpDesc& op = pass_desc.replace(index);
       OpDesc op_desc;
       std::vector<Node *> in_nodes, out_nodes;
       op_desc.SetType(op.type());
@@ -230,6 +316,30 @@ GraphPatternDetector::handle_t GetGenerateRewrite(
       for (const proto::OpDesc::Attr& attr : op.attrs()) {
         op_desc.SetAttr(attr.name(), GetAttrValue(attr));
       }
+      for (const auto& attr_map : pass_desc.op_attr_maps()) {
+        if (attr_map.replace_attr().op_index() == index) {
+          Attribute attr;
+          if (attr_map.pattern_attr().role() ==
+              proto::PassDesc_RoleType_kVariable) {
+            Node* condition_node = subgraph.at(
+                pattern.RetrieveNode(attr_map.pattern_attr().var_name()));
+            attr =
+                GetVarAttrValue(condition_node->Var(), attr_map.pattern_attr());
+          } else {
+            Node* condition_node = subgraph.at(pattern.RetrieveNode(
+                std::to_string(attr_map.pattern_attr().op_index())));
+            attr =
+                condition_node->Op()->GetAttr(attr_map.pattern_attr().name());
+          }
+          if (attr_map.has_operation()) {
+            Attribute operation = GetAttrValue(attr_map.operation().value());
+            attr = boost::apply_visitor(
+                operation_visitor(attr_map.operation().type()), attr,
+                operation);
+          }
+          op_desc.SetAttr(attr_map.replace_attr().name(), attr);
+        }
+      }
       // Create a Node for current operator.
       Node* op_node = graph->CreateOpNode(&op_desc);
       for (Node* node : in_nodes) {
@@ -266,7 +376,7 @@ void GeneratePass::ApplyImpl(Graph* graph) const {
   for (const proto::PassDesc& pass_desc : multi_pass_desc_.pass_descs()) {
     GraphPatternDetector detector;
     InitGeneratePattern(pass_desc, detector.mutable_pattern());
-    if (pass_desc.replace().blocks(0).ops_size() == 0) {
+    if (pass_desc.replace_size() == 0) {
       detector(graph, GetGenerateDelete(detector.pattern(), pass_desc));
     } else {
       detector(graph, GetGenerateRewrite(detector.pattern(), pass_desc));
@@ -282,37 +392,6 @@ void GeneratePass::VerifyDesc() const {
   PADDLE_ENFORCE_NE(multi_pass_desc_.pass_descs_size(), 0,
                     platform::errors::InvalidArgument(
                         "Size of PassDesc should not be empty."));
-  for (const proto::PassDesc& pass_desc : multi_pass_desc_.pass_descs()) {
-    // Check inputs/outputs of subgraph should in `var_maps`.
-    std::set<std::string> pattern_var_sets, replace_var_sets;
-    for (const proto::PassDesc::VarMap& var_map : pass_desc.var_maps()) {
-      pattern_var_sets.emplace(var_map.pattern_var());
-      replace_var_sets.emplace(var_map.replace_var());
-    }
-    auto check_vars = [=](std::set<std::string>* var_sets,
-                          const proto::BlockDesc& block) {
-      for (const proto::OpDesc& op : block.ops()) {
-        for (const proto::OpDesc::Var& var : op.outputs()) {
-          for (const std::string& argument : var.arguments()) {
-            var_sets->emplace(argument);
-          }
-        }
-      }
-      for (const proto::OpDesc& op : block.ops()) {
-        for (const proto::OpDesc::Var& var : op.inputs()) {
-          for (const std::string& argument : var.arguments()) {
-            PADDLE_ENFORCE_NE(
-                var_sets->find(argument), var_sets->end(),
-                platform::errors::InvalidArgument(
-                    "Subgraph of PassDesc has argument [%s] not in `var_maps`.",
-                    argument));
-          }
-        }
-      }
-    };
-    check_vars(&pattern_var_sets, pass_desc.pattern().blocks(0));
-    check_vars(&replace_var_sets, pass_desc.replace().blocks(0));
-  }
 }
 
 bool GeneratePass::VerifyGraph(const Graph& graph) {
@@ -403,8 +482,8 @@ PassPairs::PassPairs(const SubgraphType& pattern, const SubgraphType& replace) {
 void PassPairs::AddPassDesc(const SubgraphType& pattern,
                             const SubgraphType& replace) {
   proto::PassDesc* pass_desc = multi_pass_desc_.add_pass_descs();
-  pass_desc->mutable_pattern()->CopyFrom(pattern.ProgramDesc());
-  pass_desc->mutable_replace()->CopyFrom(replace.ProgramDesc());
+  pass_desc->mutable_pattern()->CopyFrom(pattern.ProgramDesc().blocks(0).ops());
+  pass_desc->mutable_replace()->CopyFrom(replace.ProgramDesc().blocks(0).ops());
   PADDLE_ENFORCE_EQ(pattern.InputVars().size(), replace.InputVars().size(),
                     platform::errors::InvalidArgument(
                         "Size of lambda expression arguments is not equal "
diff --git a/paddle/fluid/framework/ir/graph_viz_pass.cc b/paddle/fluid/framework/ir/graph_viz_pass.cc
index f2c711fb6f004..735b433b6cfe1 100644
--- a/paddle/fluid/framework/ir/graph_viz_pass.cc
+++ b/paddle/fluid/framework/ir/graph_viz_pass.cc
@@ -62,10 +62,14 @@ void GraphVizPass::ApplyImpl(ir::Graph* graph) const {
         }
       }
     }
+    const std::string& optim_cache_dir = Get<std::string>("optim_cache_dir");
     std::string program_bytes = program_desc.Proto()->SerializeAsString();
     // rename from "17_ir_fc_fuse_pass.dot" to "fc_fuse_pass.pdmodel"
     program_path =
         graph_viz_path.substr(found1 + 4, found2 - found1 - 4) + ".pdmodel";
+    if (!optim_cache_dir.empty()) {
+      program_path = optim_cache_dir + "/" + program_path;
+    }
     std::ofstream file(program_path.c_str(), std::ios::binary);
     file.write(program_bytes.c_str(), program_bytes.size());
     file.close();
diff --git a/paddle/fluid/framework/ir/mkldnn/conv_activation_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/conv_activation_mkldnn_fuse_pass.cc
index aaae505edde38..c817400056c21 100644
--- a/paddle/fluid/framework/ir/mkldnn/conv_activation_mkldnn_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/mkldnn/conv_activation_mkldnn_fuse_pass.cc
@@ -88,6 +88,13 @@ void ConvActivationFusePass::ApplyImpl(ir::Graph* graph) const {
     desc->SetAttr("fuse_beta",
                   activation->Op()->GetAttrIfExists<float>("beta"));
 
+    if (activation_type() == "hard_sigmoid") {
+      desc->SetAttr("fuse_alpha",
+                    activation->Op()->GetAttrIfExists<float>("slope"));
+      desc->SetAttr("fuse_beta",
+                    activation->Op()->GetAttrIfExists<float>("offset"));
+    }
+
     GraphSafeRemoveNodes(graph, {activation, conv_out});
 
     PADDLE_ENFORCE_GT(subgraph.count(conv_input), 0UL,
@@ -213,6 +220,26 @@ Conv2DHardSwishFusePass::Conv2DHardSwishFusePass() {
       .End();
 }
 
+Conv2DHardSigmoidFusePass::Conv2DHardSigmoidFusePass() {
+  AddOpCompat(OpCompat("hard_sigmoid"))
+      .AddInput("X")
+      .IsTensor()
+      .End()
+      .AddOutput("Out")
+      .IsTensor()
+      .End()
+      // optional, default=0.2
+      .AddAttr("slope")
+      .IsOptional()
+      .IsType<float>()
+      .End()
+      // optional, default=0.5
+      .AddAttr("offset")
+      .IsOptional()
+      .IsType<float>()
+      .End();
+}
+
 }  // namespace ir
 }  // namespace framework
 }  // namespace paddle
@@ -259,3 +286,11 @@ REGISTER_PASS_CAPABILITY(conv_hard_swish_mkldnn_fuse_pass)
         paddle::framework::compatible::OpVersionComparatorCombination()
             .LE("conv2d", 1)
             .EQ("hard_swish", 0));
+
+REGISTER_PASS(conv_hard_sigmoid_mkldnn_fuse_pass,
+              paddle::framework::ir::Conv2DHardSigmoidFusePass);
+REGISTER_PASS_CAPABILITY(conv_hard_sigmoid_mkldnn_fuse_pass)
+    .AddCombination(
+        paddle::framework::compatible::OpVersionComparatorCombination()
+            .LE("conv2d", 1)
+            .EQ("hard_sigmoid", 0));
diff --git a/paddle/fluid/framework/ir/mkldnn/conv_activation_mkldnn_fuse_pass.h b/paddle/fluid/framework/ir/mkldnn/conv_activation_mkldnn_fuse_pass.h
index d22773fb41904..eacde101d5a0a 100644
--- a/paddle/fluid/framework/ir/mkldnn/conv_activation_mkldnn_fuse_pass.h
+++ b/paddle/fluid/framework/ir/mkldnn/conv_activation_mkldnn_fuse_pass.h
@@ -72,6 +72,15 @@ class Conv2DHardSwishFusePass : public ConvActivationFusePass {
   Conv2DHardSwishFusePass();
   std::string activation_type() const { return "hard_swish"; }
 };
+/*
+ * Fuse Conv and HardSigmoid class
+ */
+class Conv2DHardSigmoidFusePass : public ConvActivationFusePass {
+ public:
+  Conv2DHardSigmoidFusePass();
+  std::string activation_type() const { return "hard_sigmoid"; }
+};
+
 }  // namespace ir
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/ir/mkldnn/conv_activation_mkldnn_fuse_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/conv_activation_mkldnn_fuse_pass_tester.cc
index 453197cda3915..a398e33416989 100644
--- a/paddle/fluid/framework/ir/mkldnn/conv_activation_mkldnn_fuse_pass_tester.cc
+++ b/paddle/fluid/framework/ir/mkldnn/conv_activation_mkldnn_fuse_pass_tester.cc
@@ -148,6 +148,9 @@ TEST(ConvActivationFusePass, conv_swish_fuse_pass) { MainTest("swish"); }
 TEST(ConvActivationFusePass, conv_hard_swish_fuse_pass) {
   MainTest("hard_swish");
 }
+TEST(ConvActivationFusePass, conv_hard_sigmoid_fuse_pass) {
+  MainTest("hard_sigmoid");
+}
 
 }  // namespace ir
 }  // namespace framework
diff --git a/paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.cc b/paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.cc
index 22babcc719aeb..619fe7ab4f738 100644
--- a/paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.cc
@@ -210,6 +210,22 @@ QuantDequantFusePass::QuantDequantFusePass() {
       .AddAttr("y_num_col_dims")
       .IsNumEQ(1)
       .End();
+  AddOpCompat(OpCompat("matmul_v2"))
+      .AddInput("X")
+      .IsTensor()
+      .End()
+      .AddInput("Y")
+      .IsTensor()
+      .End()
+      .AddOutput("Out")
+      .IsTensor()
+      .End()
+      .AddAttr("trans_x")
+      .IsBoolEQ(false)
+      .End()
+      .AddAttr("trans_y")
+      .IsBoolEQ(false)
+      .End();
   AddOpCompat(OpCompat("matmul"))
       .AddInput("X")
       .IsTensor()
@@ -355,7 +371,8 @@ void QuantDequantFusePass::DeleteQuant(ir::Graph* graph, Scope* scope,
           quantized_op_type == "fc" ||
           quantized_op_type == "conv2d_transpose") {
         op_desc->SetAttr("Input_scale", scale_value);
-      } else if (quantized_op_type == "mul" || quantized_op_type == "matmul") {
+      } else if (quantized_op_type == "mul" || quantized_op_type == "matmul" ||
+                 quantized_op_type == "matmul_v2") {
         op_desc->SetAttr("X_scale", scale_value);
       } else {
         PADDLE_THROW(platform::errors::Unimplemented(
@@ -387,7 +404,8 @@ void QuantDequantFusePass::FuseDequant(ir::Graph* graph, Scope* scope,
       quantized_op_type == "conv2d_transpose") {
     weight_name = "Filter";
     input_name = "Input";
-  } else if (quantized_op_type == "mul" || quantized_op_type == "matmul") {
+  } else if (quantized_op_type == "mul" || quantized_op_type == "matmul" ||
+             quantized_op_type == "matmul_v2") {
     weight_name = "Y";
     input_name = "X";
   } else if (quantized_op_type == "fc") {
@@ -396,7 +414,7 @@ void QuantDequantFusePass::FuseDequant(ir::Graph* graph, Scope* scope,
   } else {
     PADDLE_THROW(platform::errors::Unimplemented(
         "QuantDequantFuse: We only support conv2d, conv2d_fusion, "
-        "conv2d_transpose, fc, mul, matmul for "
+        "conv2d_transpose, fc, mul, matmul, matmul_v2 for "
         "now."));
   }
   const std::string pattern_name = "dequant_fuse";
@@ -479,14 +497,14 @@ void QuantDequantFusePass::FuseDequant(ir::Graph* graph, Scope* scope,
     // If quantized op is conv2d, weight scale size = weight dims[0]
     // If quantized op is conv2d_transpose, weight scale size = weight dims[1]
     if (quantized_op_type == "mul" || quantized_op_type == "matmul" ||
-        quantized_op_type == "fc") {
+        quantized_op_type == "matmul_v2" || quantized_op_type == "fc") {
       if (dequant_type == "fake_dequantize_max_abs") {
-        PADDLE_ENFORCE_EQ(
-            weight_scale.size(), 1,
-            platform::errors::InvalidArgument(
-                "mul/matmul op weight dequantized by [fake_dequantize_max_abs] "
-                "requires weight scale size = 1, but got %d.",
-                weight_scale.size()));
+        PADDLE_ENFORCE_EQ(weight_scale.size(), 1,
+                          platform::errors::InvalidArgument(
+                              "mul/matmul/matmul_v2 op weight dequantized by "
+                              "[fake_dequantize_max_abs] "
+                              "requires weight scale size = 1, but got %d.",
+                              weight_scale.size()));
         for (int j = 0; j < weight_tensor->numel(); j++) {
           quantized_weight_data[j] *= weight_scale[0];
         }
@@ -497,7 +515,8 @@ void QuantDequantFusePass::FuseDequant(ir::Graph* graph, Scope* scope,
           PADDLE_ENFORCE_EQ(
               quant_axis == 1, true,
               platform::errors::InvalidArgument(
-                  "'quant_axis' of mul/matmul/fc op weight dequantized by "
+                  "'quant_axis' of mul/matmul/fc/matmul_v2 op weight "
+                  "dequantized by "
                   "[fake_channel_wise_dequantize_max_abs]should be 1, but "
                   "the received is %d",
                   quant_axis));
@@ -505,9 +524,10 @@ void QuantDequantFusePass::FuseDequant(ir::Graph* graph, Scope* scope,
         PADDLE_ENFORCE_EQ(
             weight_scale.size(), static_cast<size_t>(w_dims[1]),
             platform::errors::InvalidArgument(
-                "mul/matmul op weight dequantized by "
+                "mul/matmul/matmul_v2 op weight dequantized by "
                 "[fake_channel_wise_dequantize_max_abs] requires weight scale "
-                "size = 2nd dim of mul/matmul's weight, which is %d, but got "
+                "size = 2nd dim of mul/matmul/matmul_v2's weight, which is %d, "
+                "but got "
                 "%d.",
                 static_cast<size_t>(w_dims[1]), weight_scale.size()));
         for (int j = 0; j < weight_tensor->numel(); j++) {
@@ -594,7 +614,8 @@ void QuantDequantFusePass::FuseDequant(ir::Graph* graph, Scope* scope,
     } else if (quantized_op_type == "fc") {
       new_op_desc.SetInput("Input", {new_input});
       new_op_desc.SetOutput("Out", {new_output});
-    } else if (quantized_op_type == "mul" || quantized_op_type == "matmul") {
+    } else if (quantized_op_type == "mul" || quantized_op_type == "matmul" ||
+               quantized_op_type == "matmul_v2") {
       new_op_desc.SetInput("X", {new_input});
       new_op_desc.SetOutput("Out", {new_output});
     }
@@ -621,7 +642,9 @@ void QuantDequantFusePass::ApplyImpl(ir::Graph* graph) const {
   std::unordered_set<std::string> quant_types = {
       "fake_quantize_range_abs_max", "fake_quantize_moving_average_abs_max"};
   std::unordered_set<std::string> quantized_op_types = {
-      "conv2d", "mul", "matmul", "depthwise_conv2d", "fc", "conv2d_transpose"};
+      "conv2d",           "mul", "matmul",    "depthwise_conv2d",
+      "conv2d_transpose", "fc",  "matmul_v2",
+  };
   auto* scope = param_scope();
 
   for (auto& quant_type : quant_types) {
diff --git a/paddle/fluid/framework/new_executor/CMakeLists.txt b/paddle/fluid/framework/new_executor/CMakeLists.txt
index 365083a34782a..d758e98b417e7 100644
--- a/paddle/fluid/framework/new_executor/CMakeLists.txt
+++ b/paddle/fluid/framework/new_executor/CMakeLists.txt
@@ -1,6 +1,6 @@
 set(INTERPRETERCORE_DEPS op_registry device_context scope framework_proto data_feed_proto heter_service_proto trainer_desc_proto glog 
 lod_rank_table fs shell fleet_wrapper heter_wrapper ps_gpu_wrapper box_wrapper lodtensor_printer feed_fetch_method
-graph_to_program_pass variable_helper timer monitor)
+graph_to_program_pass variable_helper timer monitor nan_inf_utils)
 
 cc_library(workqueue SRCS workqueue.cc workqueue_utils.cc DEPS enforce)
 cc_library(interpretercore_garbage_collector SRCS interpretercore_garbage_collector.cc DEPS workqueue ${DEVICE_EVENT_LIBS})
diff --git a/paddle/fluid/framework/new_executor/event_manager.cc b/paddle/fluid/framework/new_executor/event_manager.cc
index bd83f49db1d0e..a45f65d264c3a 100644
--- a/paddle/fluid/framework/new_executor/event_manager.cc
+++ b/paddle/fluid/framework/new_executor/event_manager.cc
@@ -16,32 +16,31 @@
 
 namespace paddle {
 namespace framework {
-
-void EventManager::WaitEvent(const Instruction& instruction,
-                             const platform::Place& place) {
+namespace interpreter {
+void WaitEvent(const Instruction& instruction, const platform::Place& place) {
   // If InterpreterCore in on CPUPlace, do nothing.
   if (platform::is_cpu_place(place)) return;
 
-  VLOG(3) << "Deal StreamWaitEventOrSync for "
-          << instruction.kernel_func_.operator_base_->Type();
+  VLOG(3) << "Deal StreamWaitEventOrSync for " << instruction.OpBase()->Type();
 
-  for (auto& event_iter : instruction.intput_events_) {
+  for (auto& event_iter : instruction.InputEvents()) {
     VLOG(3) << "wait var_id: " << event_iter.var_id_
             << " 's event with waiter_type: " << event_iter.waiter_type_;
-    event_iter.event_->Wait(event_iter.waiter_type_, instruction.dev_ctx_);
+    event_iter.event_->Wait(event_iter.waiter_type_,
+                            &instruction.DeviceContext());
   }
 }
 
-void EventManager::RecordEvent(const Instruction& instruction,
-                               const platform::Place& place) {
+void RecordEvent(const Instruction& instruction, const platform::Place& place) {
   // If InterpreterCore in on CPUPlace, do nothing.
   if (platform::is_cpu_place(place)) return;
 
-  for (auto& event : instruction.output_events_) {
+  for (auto& event : instruction.OutputEvents()) {
     VLOG(3) << "Record event in out_var_id: " << event.var_id_;
-    event.event_->Record(instruction.dev_ctx_);
+    event.event_->Record(&instruction.DeviceContext());
   }
 }
 
+}  // namespace interpreter
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/new_executor/event_manager.h b/paddle/fluid/framework/new_executor/event_manager.h
index d23c240469f96..a949ae144017d 100644
--- a/paddle/fluid/framework/new_executor/event_manager.h
+++ b/paddle/fluid/framework/new_executor/event_manager.h
@@ -17,14 +17,11 @@
 
 namespace paddle {
 namespace framework {
+namespace interpreter {
+void RecordEvent(const Instruction& instruction, const platform::Place& place);
 
-class EventManager {
- public:
-  void RecordEvent(const Instruction& instruction,
-                   const platform::Place& place);
-
-  void WaitEvent(const Instruction& instruction, const platform::Place& place);
-};
+void WaitEvent(const Instruction& instruction, const platform::Place& place);
 
+}  // namespace interpreter
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/new_executor/interpretercore.cc b/paddle/fluid/framework/new_executor/interpretercore.cc
index d6ea840362e7e..89810fd303802 100644
--- a/paddle/fluid/framework/new_executor/interpretercore.cc
+++ b/paddle/fluid/framework/new_executor/interpretercore.cc
@@ -13,16 +13,19 @@
 // limitations under the License.
 
 #include "paddle/fluid/framework/new_executor/interpretercore.h"
-#include "paddle/fluid/framework/new_executor/interpretercore_util.h"
-
 #include <unordered_set>
-
+#include "paddle/fluid/framework/details/nan_inf_utils.h"
 #include "paddle/fluid/framework/details/share_tensor_buffer_functor.h"
+#include "paddle/fluid/framework/new_executor/interpretercore_util.h"
+#include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/platform/profiler.h"
 
 PADDLE_DEFINE_EXPORTED_bool(new_executor_use_inplace, true,
                             "Use inplace in new executor");
 
+DECLARE_bool(check_nan_inf);
+DECLARE_bool(benchmark);
+
 constexpr const char* kExceptionCaught = "ExceptionCaught";
 
 namespace paddle {
@@ -31,25 +34,20 @@ namespace framework {
 static constexpr size_t kHostNumThreads = 4;
 
 InterpreterCore::InterpreterCore(const platform::Place& place,
-                                 const ProgramDesc& main_prog,
-                                 VariableScope* global_scope,
-                                 const std::vector<std::string>& feed_names,
-                                 const std::vector<std::string>& fetch_names)
+                                 const BlockDesc& block,
+                                 VariableScope* global_scope)
     : place_(place),
-      main_program_(main_prog),
+      block_(block),
       global_scope_(global_scope),
-      stream_analyzer_(place),
-      async_work_queue_(kHostNumThreads, &main_thread_blocker_) {
+      stream_analyzer_(place) {
   is_build_ = false;
-
-  feed_names_ = feed_names;
+  async_work_queue_.reset(
+      new interpreter::AsyncWorkQueue(kHostNumThreads, &main_thread_blocker_));
+  gc_.reset(new InterpreterCoreGarbageCollector());
 
   exception_notifier_ = main_thread_blocker_.RegisterEvent(
       kExceptionCaught, [this]() { return exception_holder_.IsCaught(); });
 
-  // Step1: add feedop and fetchop to main_program
-  AddFetch(fetch_names);
-
   // prune
 
   // optmize graph pass
@@ -57,88 +55,57 @@ InterpreterCore::InterpreterCore(const platform::Place& place,
   // convert to run graph
 }
 
-void InterpreterCore::AddFetch(const std::vector<std::string>& fetch_names) {
-  auto* fetch_holder = main_program_.MutableBlock(0)->Var("fetch_vars");
-  fetch_holder->SetType(proto::VarType::FETCH_LIST);
-  fetch_holder->SetPersistable(true);
-
-  int i = 0;
-  for (auto& fetch_name : fetch_names) {
-    // append fetch op
-    auto* op = main_program_.MutableBlock(0)->AppendOp();
-    op->SetType("fetch_v2");
-    op->SetInput("X", {fetch_name});
-    op->SetOutput("Out", {"fetch_vars"});
-    op->SetAttr("col", {static_cast<int>(i)});
-    op->CheckAttrs();
-    i++;
-  }
+InterpreterCore::~InterpreterCore() {
+  // cancle gc's thread
+  gc_.reset(nullptr);
+
+  async_work_queue_.reset(nullptr);
 }
 
 paddle::framework::FetchList InterpreterCore::Run(
-    const std::vector<framework::Tensor>& feed_tensors) {
-  auto FeedInput = [&] {
-    for (size_t i = 0; i < feed_names_.size(); ++i) {
-      auto it = global_scope_->name2id.find(feed_names_[i]);
-      assert(it != global_scope_->name2id.end());
+    const std::vector<std::string>& feed_names,
+    const std::vector<framework::LoDTensor>& feed_tensors) {
+  bool is_build = is_build_;
+  Prepare(feed_names, feed_tensors, is_build);
 
-      auto feed_tensor = global_scope_->var_list[it->second]
-                             ->GetMutable<framework::LoDTensor>();
-      feed_tensor->ShareDataWith(feed_tensors[i]);
-    }
-  };
-
-  if (is_build_ == false) {
-    paddle::framework::interpretercore::build_variable_scope(main_program_,
-                                                             global_scope_);
-    FeedInput();
-    paddle::framework::interpretercore::build_op_func_list(
-        place_, main_program_, &op_list_, &vec_func_list_, global_scope_);
-    is_build_ = true;
-    // convert vec func_list to graph
-    Convert();
-  } else {
-    FeedInput();
+  if (is_build) {
     ExecuteInstructionList(vec_instruction_);
   }
 
   // return Fetch Tensors
-  return *(global_scope_->var_list[global_scope_->name2id["fetch_vars"]]
-               ->GetMutable<framework::FetchList>());
+  auto* fetch_var = global_scope_->Var(interpreter::kFetchVarName);
+  return *(fetch_var->GetMutable<framework::FetchList>());
 }
 
 void InterpreterCore::Convert() {
-  input_var2op_info_.resize(global_scope_->var_list.size());
-
-  vec_instruction_.reserve(vec_func_list_.size());
-  dependecy_count_.resize(vec_func_list_.size());
-  vec_meta_info_.resize(global_scope_->var_list.size());
-  for (size_t i = 0; i < vec_func_list_.size(); ++i) {
-    Instruction temp_inst;
-    auto* op_base = op_list_[i];
-    temp_inst.dev_ctx_ =
-        stream_analyzer_.ParseDeviceContext(vec_func_list_[i], *op_base);
-    temp_inst.kernel_func_.compute_func_ = vec_func_list_[i].kernel_func_;
-    temp_inst.kernel_func_.operator_base_ = op_base;
-    temp_inst.input_index_ = vec_func_list_[i].input_index;
-    temp_inst.output_index_ = vec_func_list_[i].output_index;
-    temp_inst.type_ = vec_func_list_[i].type_;
-    temp_inst.no_data_transform_index_ =
-        vec_func_list_[i].no_data_transform_index;
+  auto& vec_meta_info = global_scope_->MutableVecMetaInfo();
+  auto var_nums = global_scope_->VarSize();
+  input_var2op_info_.resize(var_nums);
 
-    OpInOutInfo info;
+  auto op_nums = vec_func_list_.size();
+  vec_instruction_.reserve(op_nums);
+  dependecy_count_.resize(op_nums);
+
+  for (size_t op_idx = 0; op_idx < op_nums; ++op_idx) {
+    auto& op_func_node = vec_func_list_[op_idx];
+    auto* dev_ctx_ = stream_analyzer_.ParseDeviceContext(op_func_node);
+
+    vec_instruction_.emplace_back(op_idx, op_func_node, *dev_ctx_);
+    auto& instr = vec_instruction_.back();
 
+    OpInOutInfo info;
     std::vector<size_t> gc_check_input_list;
-    for (auto& item : vec_func_list_[i].input_index) {
+
+    for (auto& item : op_func_node.input_index) {
       for (auto id : item.second) {
-        input_var2op_info_[id].push_back(i);
+        input_var2op_info_.at(id).push_back(op_idx);
         // var can be gc-ed
         if (!info.IsBuilt()) {
-          info.Build(op_list_[i]);
+          info.Build(op_func_node.operator_base_);
         }
-        if (global_scope_->vec_meta_info_[id].vardesc_) {
-          if (info.IsInArgBufferNeeded(
-                  global_scope_->vec_meta_info_[id].vardesc_->Name())) {
+        auto* var_desc = global_scope_->VarDesc(id);
+        if (var_desc) {
+          if (info.IsInArgBufferNeeded(var_desc->Name())) {
             gc_check_input_list.push_back(id);
           }
         } else {
@@ -150,23 +117,21 @@ void InterpreterCore::Convert() {
     auto last =
         std::unique(gc_check_input_list.begin(), gc_check_input_list.end());
     gc_check_input_list.erase(last, gc_check_input_list.end());
+
     for (auto var_id : gc_check_input_list) {
-      vec_meta_info_[var_id].var_ref_count_++;
+      vec_meta_info[var_id].var_ref_count_++;
+      instr.AddGCCheckVar(var_id);
     }
-
-    temp_inst.gc_check_var_list.swap(gc_check_input_list);
-
-    vec_instruction_.push_back(temp_inst);
   }
 
   for (size_t i = 0; i < vec_instruction_.size(); ++i) {
     // checkout ouput
-    for (auto& item : vec_instruction_[i].output_index_) {
+    for (auto& item : vec_instruction_[i].Outputs()) {
       for (auto id : item.second) {
-        if (input_var2op_info_[id].size() == 0) {
+        if (input_var2op_info_.at(id).size() == 0) {
           // output var not be used by any kernel
-          vec_instruction_[i].gc_check_var_list.push_back(id);
-          vec_meta_info_[id].var_ref_count_++;
+          vec_instruction_[i].AddGCCheckVar(id);
+          vec_meta_info[id].var_ref_count_++;
         }
       }
     }
@@ -174,10 +139,9 @@ void InterpreterCore::Convert() {
 
   for (size_t i = 0; i < vec_instruction_.size(); ++i) {
     std::vector<size_t> vec_temp;
-    for (auto& item : vec_instruction_[i].output_index_) {
+    for (auto& item : vec_instruction_[i].Outputs()) {
       for (auto id : item.second) {
-        vec_temp =
-            interpretercore::merge_vector(vec_temp, input_var2op_info_[id]);
+        vec_temp = interpreter::merge_vector(vec_temp, input_var2op_info_[id]);
       }
     }
 
@@ -199,13 +163,13 @@ void InterpreterCore::Convert() {
   }
 
   for (size_t i = 0; i < vec_instruction_.size(); ++i) {
-    BuildAndCacheInstructionCtx(&vec_instruction_[i], *global_scope_, place_);
+    BuildAndCacheInstructionCtx(&vec_instruction_[i]);
   }
 
   BuildSkipShareLoDInfo();
 
   for (size_t i = 0; i < vec_instruction_.size(); ++i) {
-    gc_event_.emplace_back(vec_instruction_[i].execution_ctx_.get()->GetPlace(),
+    gc_event_.emplace_back(vec_instruction_[i].DeviceContext().GetPlace(),
                            platform::GenerateDeviceEventFlag());
   }
 
@@ -215,15 +179,14 @@ void InterpreterCore::Convert() {
 }
 
 bool InterpreterCore::BuildInplaceCheckVarIsOnlyInput(size_t var_index) {
-  if (!global_scope_->vec_meta_info_[var_index].vardesc_) {
-    return input_var2op_info_[var_index].size() == 1;
+  if (!global_scope_->VarDesc(var_index)) {
+    return input_var2op_info_.at(var_index).size() == 1;
   } else {
     int is_input_cnt = 0;
-    for (auto inst_id : input_var2op_info_[var_index]) {
+    for (auto inst_id : input_var2op_info_.at(var_index)) {
       OpInOutInfo info;
-      info.Build(vec_instruction_[inst_id].kernel_func_.operator_base_);
-      if (info.IsInArgBufferNeeded(
-              global_scope_->vec_meta_info_[var_index].vardesc_->Name())) {
+      info.Build(vec_instruction_.at(inst_id).OpBase());
+      if (info.IsInArgBufferNeeded(global_scope_->VarDesc(var_index)->Name())) {
         is_input_cnt++;
       }
     }
@@ -233,35 +196,32 @@ bool InterpreterCore::BuildInplaceCheckVarIsOnlyInput(size_t var_index) {
 
 void InterpreterCore::BuildInplace() {
   for (size_t i = 0; i < vec_instruction_.size(); ++i) {
-    if (!vec_instruction_[i]
-             .kernel_func_.operator_base_->Info()
-             .infer_inplace_) {
+    auto& instr = vec_instruction_[i];
+    auto* op_base = instr.OpBase();
+    if (!op_base->Info().infer_inplace_) {
       continue;
     }
 
-    auto in_to_outs =
-        vec_instruction_[i].kernel_func_.operator_base_->Info().infer_inplace_(
-            platform::is_gpu_place(vec_instruction_[i].dev_ctx_->GetPlace()));
+    auto in_to_outs = op_base->Info().infer_inplace_(
+        platform::is_gpu_place(instr.DeviceContext().GetPlace()));
 
+    auto& inputs = instr.Inputs();
+    auto& outputs = instr.Outputs();
     for (auto& pair : in_to_outs) {
-      auto iter = vec_instruction_[i].input_index_.find(pair.first);
-      if (iter != vec_instruction_[i].input_index_.end()) {
+      auto iter = inputs.find(pair.first);
+      if (iter != inputs.end() && !iter->second.empty()) {
         if (BuildInplaceCheckVarIsOnlyInput(iter->second[0])) {
-          auto iterout = vec_instruction_[i].output_index_.find(pair.second);
-          if (iterout != vec_instruction_[i].output_index_.end()) {
-            auto invar = global_scope_->var_list[iter->second[0]];
-            auto outvar = global_scope_->var_list[iterout->second[0]];
-            if (invar && outvar) {
-              vec_instruction_[i].vec_inplace_in_to_out_.emplace_back(invar,
-                                                                      outvar);
-              VLOG(3) << "inplace "
-                      << vec_instruction_[i].kernel_func_.operator_base_->Type()
-                      << " "
-                      << global_scope_->vec_meta_info_[iter->second[0]]
-                             .vardesc_->Name()
+          auto iterout = outputs.find(pair.second);
+          if (iterout != outputs.end() && !iterout->second.empty()) {
+            auto invar = global_scope_->Var(iter->second[0]);
+            auto outvar = global_scope_->Var(iterout->second[0]);
+            if (invar && outvar && invar->IsType<LoDTensor>() &&
+                outvar->IsType<LoDTensor>()) {
+              instr.AddInplace(invar, outvar);
+              VLOG(3) << "inplace " << vec_instruction_[i].OpBase()->Type()
+                      << " " << global_scope_->GetNameById(iter->second[0])
                       << " -> "
-                      << global_scope_->vec_meta_info_[iterout->second[0]]
-                             .vardesc_->Name()
+                      << global_scope_->GetNameById(iterout->second[0])
                       << std::endl;
             }
           }
@@ -271,51 +231,36 @@ void InterpreterCore::BuildInplace() {
   }
 }
 
-void InterpreterCore::BuildAndCacheInstructionCtx(
-    Instruction* instr_node, const VariableScope& var_scope,
-    const platform::Place& place) {
-  auto op_base = instr_node->kernel_func_.operator_base_;
-
+void InterpreterCore::BuildAndCacheInstructionCtx(Instruction* instr_node) {
   VariableValueMap ins_map;
-  for (auto& var_name_item : instr_node->input_index_) {
+  for (auto& var_name_item : instr_node->Inputs()) {
     std::vector<Variable*> input_vars;
 
     input_vars.reserve(var_name_item.second.size());
     for (auto& id : var_name_item.second) {
-      input_vars.emplace_back(var_scope.var_list[id]);
+      input_vars.emplace_back(global_scope_->Var(id));
     }
     ins_map.emplace(var_name_item.first, std::move(input_vars));
   }
 
   VariableValueMap outs_map;
-  for (auto& var_name_item : instr_node->output_index_) {
+  for (auto& var_name_item : instr_node->Outputs()) {
     std::vector<Variable*> out_vars;
 
     out_vars.reserve(var_name_item.second.size());
     for (auto& id : var_name_item.second) {
-      out_vars.emplace_back(var_scope.var_list[id]);
+      out_vars.emplace_back(global_scope_->Var(id));
     }
     outs_map.emplace(var_name_item.first, std::move(out_vars));
   }
-
-  instr_node->runtime_ctx_.reset(new RuntimeContext({}, {}));
-  instr_node->runtime_ctx_->inputs.swap(ins_map);
-  instr_node->runtime_ctx_->outputs.swap(outs_map);
-
-  instr_node->infershape_ctx_.reset(new InterpretercoreInferShapeContext(
-      *op_base, *instr_node->runtime_ctx_.get()));
-
-  auto* dev_ctx = instr_node->dev_ctx_;
-  Scope scope;
-
-  instr_node->execution_ctx_.reset(new ExecutionContext(
-      *op_base, scope, *dev_ctx, *instr_node->runtime_ctx_.get()));
+  // set runtime_ctx and infershape_ctx_
+  instr_node->ResetContext(ins_map, outs_map);
 }
 
 void InterpreterCore::BuildSkipShareLoDInfo() {
   for (size_t i = 0; i < vec_instruction_.size(); ++i) {
     bool can_skip_lod = true;
-    for (auto& input : vec_instruction_[i].runtime_ctx_.get()->inputs) {
+    for (auto& input : vec_instruction_[i].InnerRuntimeContext()->inputs) {
       for (auto& var : input.second) {
         if (var->IsType<LoDTensor>()) {
           if (var->Get<LoDTensor>().lod().size() != 0) {
@@ -328,23 +273,28 @@ void InterpreterCore::BuildSkipShareLoDInfo() {
         }
       }
     }
-    vec_instruction_[i].infershape_ctx_.get()->SetSkipLoD(can_skip_lod);
+    vec_instruction_[i].InnerInferShapeContext()->SetSkipLoD(can_skip_lod);
   }
 }
 
 void InterpreterCore::RunInstruction(const Instruction& instr_node) {
-  VLOG(3) << "RunInstruction:  "
-          << instr_node.kernel_func_.operator_base_->Type();
+  auto* op = instr_node.OpBase();
+  auto place = instr_node.DeviceContext().GetPlace();
+  VLOG(4) << place << " " << op->DebugStringEx(global_scope_);
 
+  auto op_with_kernel = dynamic_cast<const framework::OperatorWithKernel*>(op);
   {
     platform::RecordEvent infershape_event("InferShape");
-    static_cast<const framework::OperatorWithKernel*>(
-        instr_node.kernel_func_.operator_base_)
-        ->InferShape(instr_node.infershape_ctx_.get());
+    // If it is OperatorBase, InferShape do nothing.
+    if (op_with_kernel != nullptr)
+      op_with_kernel->InferShape(instr_node.InnerInferShapeContext().get());
   }
 
-  if (FLAGS_new_executor_use_inplace) {
-    for (auto& pair : instr_node.vec_inplace_in_to_out_) {
+  if (op_with_kernel != nullptr &&
+      FLAGS_new_executor_use_inplace) {  // TODO(xiongkun03) Does operator
+                                         // base support
+                                         // inplace ?
+    for (auto& pair : instr_node.InplaceInfo()) {
       const auto& in = paddle::framework::details::GetTensorFromVar(pair.first);
       auto* out =
           paddle::framework::details::GetMutableTensorFromVar(pair.second);
@@ -355,22 +305,50 @@ void InterpreterCore::RunInstruction(const Instruction& instr_node) {
   }
   {
     platform::RecordEvent compute_event("Compute");
-    instr_node.kernel_func_.compute_func_(*instr_node.execution_ctx_.get());
+    if (op_with_kernel == nullptr)
+      instr_node.OpBase()->Run(*global_scope_->GetScope(), place_);
+    else
+      instr_node.KernelFunc()(*instr_node.InnerExecutionContext().get());
+  }
+
+  VLOG(3) << place << " " << op->DebugStringEx(global_scope_);
+
+  /*For profiling/benchmark only*/
+  if (FLAGS_benchmark) {
+    instr_node.DeviceContext().Wait();
+#if defined(PADDLE_WITH_CUDA)
+    PADDLE_ENFORCE_CUDA_SUCCESS(cudaGetLastError());
+    VLOG(4) << "Operator(" << op->Type()
+            << "): context wait and get last error";
+#endif
+#if defined(PADDLE_WITH_HIP)
+    PADDLE_ENFORCE_CUDA_SUCCESS(hipGetLastError());
+    VLOG(4) << "Operator(" << op->Type()
+            << "): context wait and get last error";
+#endif
+  }
+
+  // for debug nan/inf
+  if (FLAGS_check_nan_inf) {
+    VLOG(4) << "Check nan/inf";
+    framework::details::CheckOpHasNanOrInf(
+        *op, *global_scope_,
+        place);  // TODO(xiongkun03) change it to inner scope.
   }
 }
 
 void InterpreterCore::ExecuteInstructionList(
     const std::vector<Instruction>& vec_instr) {
-  async_work_queue_.PrepareAtomicDeps(dependecy_count_);
-  async_work_queue_.PrepareAtomicVarRef(vec_meta_info_);
+  async_work_queue_->PrepareAtomicDeps(dependecy_count_);
+  async_work_queue_->PrepareAtomicVarRef(global_scope_->VecMetaInfo());
   op_run_number_ = 0;
 
   exception_holder_.Clear();
 
   for (size_t i = 0; i < dependecy_count_.size(); ++i) {
     if (dependecy_count_[i] == 0) {
-      async_work_queue_.AddTask(vec_instr[i].type_,
-                                [&, i] { RunInstructionAsync(i); });
+      async_work_queue_->AddTask(vec_instr.at(i).KernelType(),
+                                 [&, i] { RunInstructionAsync(i); });
     }
   }
 
@@ -391,43 +369,43 @@ void InterpreterCore::ExecuteInstructionList(
 
 void InterpreterCore::RunNextInstructions(
     const Instruction& instr, std::queue<size_t>* reserved_next_ops) {
-  auto& next_instr = instr.next_instruction_;
-  auto& atomic_deps = async_work_queue_.AtomicDeps();
+  auto& next_instr = instr.NextInstructions();
+  auto& atomic_deps = async_work_queue_->AtomicDeps();
   auto IsReady = [&](size_t next_id) {
     return atomic_deps[next_id]->fetch_sub(1, std::memory_order_relaxed) == 1;
   };
 
-  if (instr.type_ == OpFuncType::kQueueAsync) {
+  if (instr.KernelType() == OpFuncType::kQueueAsync) {
     // move all sync_ops into other threads
-    for (auto next_id : next_instr.synchronize_run_) {
+    for (auto next_id : next_instr.SyncRunIds()) {
       if (IsReady(next_id)) {
-        async_work_queue_.AddTask(
-            vec_instruction_[next_id].type_,
+        async_work_queue_->AddTask(
+            vec_instruction_[next_id].KernelType(),
             [&, next_id] { RunInstructionAsync(next_id); });
       }
     }
     // keep all async_ops running in current thread
-    for (auto next_id : next_instr.direct_run_) {
+    for (auto next_id : next_instr.DirectRunIds()) {
       if (IsReady(next_id)) {
         reserved_next_ops->push(next_id);
       }
     }
-    for (auto next_id : next_instr.event_wait_run_) {
+    for (auto next_id : next_instr.EventRunIds()) {
       if (IsReady(next_id)) {
         reserved_next_ops->push(next_id);
       }
     }
   } else {
     // move async_ops into async_thread
-    for (auto next_id : next_instr.event_wait_run_) {
+    for (auto next_id : next_instr.EventRunIds()) {
       if (IsReady(next_id)) {
-        async_work_queue_.AddTask(
-            vec_instruction_[next_id].type_,
+        async_work_queue_->AddTask(
+            vec_instruction_[next_id].KernelType(),
             [&, next_id] { RunInstructionAsync(next_id); });
       }
     }
-    auto direct_run_ops = interpretercore::merge_vector(
-        next_instr.synchronize_run_, next_instr.direct_run_);
+    auto direct_run_ops = interpreter::merge_vector(next_instr.SyncRunIds(),
+                                                    next_instr.DirectRunIds());
     size_t first_op = 0;
     for (auto next_id : direct_run_ops) {
       if (IsReady(next_id)) {
@@ -437,8 +415,8 @@ void InterpreterCore::RunNextInstructions(
           continue;
         }
         // move rest ops into other threads
-        async_work_queue_.AddTask(
-            vec_instruction_[next_id].type_,
+        async_work_queue_->AddTask(
+            vec_instruction_[next_id].KernelType(),
             [&, next_id] { RunInstructionAsync(next_id); });
       }
     }
@@ -452,10 +430,10 @@ void InterpreterCore::RunInstructionAsync(size_t instr_id) {
   while (!ready_ops.empty()) {
     instr_id = ready_ops.front();
     ready_ops.pop();
-    auto& instr_node = vec_instruction_[instr_id];
-    auto* op = instr_node.kernel_func_.operator_base_;
+    auto& instr_node = vec_instruction_.at(instr_id);
+    auto* op = instr_node.OpBase();
     platform::RecordEvent instruction_event(op->Type());
-    event_manager_.WaitEvent(instr_node, place_);
+    interpreter::WaitEvent(instr_node, place_);
 
     try {
       RunInstruction(instr_node);
@@ -482,78 +460,83 @@ void InterpreterCore::RunInstructionAsync(size_t instr_id) {
       return;
     }
 
-    event_manager_.RecordEvent(instr_node, place_);
+    interpreter::RecordEvent(instr_node, place_);
     op_run_number_.fetch_add(1, std::memory_order_relaxed);
 
     // GC infomation
-    CheckGC(instr_id, instr_node.gc_check_var_list);
+    CheckGC(instr_node);
 
     RunNextInstructions(instr_node, &ready_ops);
   }
 }
 
-void InterpreterCore::CheckGC(size_t instr_id,
-                              const std::vector<size_t>& gc_check_list) {
+void InterpreterCore::CheckGC(const Instruction& instr) {
+  size_t instr_id = instr.Id();
   auto& var_scope = *global_scope_;
-  auto& atomic_var_ref = async_work_queue_.AtomicVarRef();
+  auto& atomic_var_ref = async_work_queue_->AtomicVarRef();
 
-  for (auto var_id : gc_check_list) {
+  for (auto var_id : instr.GCCheckVars()) {
     bool is_ready =
         atomic_var_ref[var_id]->fetch_sub(1, std::memory_order_relaxed) == 1;
-    if (is_ready && var_scope.vec_meta_info_[var_id].vardesc_ &&
-        !var_scope.vec_meta_info_[var_id].vardesc_->Persistable()) {
-      gc_.Add(var_scope.var_list[var_id], gc_event_[instr_id],
-              vec_instruction_[instr_id].dev_ctx_);
-    } else if (is_ready &&
-               var_scope.vec_meta_info_[var_id].vardesc_ == nullptr) {
-      gc_.Add(var_scope.var_list[var_id], gc_event_[instr_id],
-              vec_instruction_[instr_id].dev_ctx_);
+    // ignore all persistable var while GC
+    if (var_scope.VarDesc(var_id) && var_scope.VarDesc(var_id)->Persistable()) {
+      continue;
+    }
+    if (is_ready) {
+      gc_->Add(var_scope.Var(var_id), gc_event_.at(instr_id),
+               &instr.DeviceContext());
     }
   }
 }
 
-void InterpreterCore::DryRunPrepare(
-    const std::vector<framework::Tensor>& feed_tensors) {
+void InterpreterCore::Prepare(
+    const std::vector<std::string>& feed_names,
+    const std::vector<framework::LoDTensor>& feed_tensors, bool prepare_feed) {
+  PADDLE_ENFORCE_EQ(feed_names.size(), feed_tensors.size(),
+                    platform::errors::PreconditionNotMet(
+                        "Required feed_names.size() == feed_tensors.size(), "
+                        "but received %d != %d",
+                        feed_names.size(), feed_tensors.size()));
+
   auto FeedInput = [&] {
-    for (size_t i = 0; i < feed_names_.size(); ++i) {
-      auto it = global_scope_->name2id.find(feed_names_[i]);
-      assert(it != global_scope_->name2id.end());
+    for (size_t i = 0; i < feed_names.size(); ++i) {
+      auto* feed_var = global_scope_->FindVar(feed_names[i]);
+      PADDLE_ENFORCE_NOT_NULL(feed_var, platform::errors::NotFound(
+                                            "feed_var shall not be nullptr."));
 
-      auto feed_tensor = global_scope_->var_list[it->second]
-                             ->GetMutable<framework::LoDTensor>();
+      auto feed_tensor = feed_var->GetMutable<framework::LoDTensor>();
       feed_tensor->ShareDataWith(feed_tensors[i]);
+      feed_tensor->set_lod(feed_tensors[i].lod());
     }
   };
 
-  if (is_build_ == false) {
-    paddle::framework::interpretercore::build_variable_scope(main_program_,
-                                                             global_scope_);
+  if (!is_build_) {
+    paddle::framework::interpreter::build_variable_scope(block_, global_scope_);
     FeedInput();
-    paddle::framework::interpretercore::build_op_func_list(
-        place_, main_program_, &op_list_, &vec_func_list_, global_scope_);
+    paddle::framework::interpreter::build_op_func_list(
+        place_, block_, &vec_func_list_, global_scope_);
     is_build_ = true;
     // convert vec func_list to graph
     Convert();
   }
   // NOTE: Because feed_tensor will be GC after
   // paddle::framework::build_op_func_list, so we should
-  // call
-  // FeedInput again.
-  FeedInput();
+  // call FeedInput again.
+  if (prepare_feed) FeedInput();
 }
 
-const CostInfo& InterpreterCore::DryRun(
-    const std::vector<framework::Tensor>& feed_tensors) {
-  DryRunPrepare(feed_tensors);
-  // DryRun may be called many times.
-  dry_run_profiler_.Reset();
-  dry_run_profiler_.Start();
-  ExecuteInstructionList(vec_instruction_);
-  platform::DeviceContextPool::Instance().Get(place_)->Wait();
-
-  dry_run_profiler_.Pause();
-  dry_run_profiler_.TotalCUDAAllocatedMemorySize(place_);
-  return dry_run_profiler_.GetCostInfo();
+interpreter::CostInfo InterpreterCore::DryRun(
+    const std::vector<std::string>& feed_names,
+    const std::vector<framework::LoDTensor>& feed_tensors) {
+  Prepare(feed_names, feed_tensors, true);
+  interpreter::CostInfo cost_info;
+  {
+    interpreter::ProfilerGuard(place_, &cost_info);
+    ExecuteInstructionList(vec_instruction_);
+    platform::DeviceContextPool::Instance().Get(place_)->Wait();
+  }
+
+  return cost_info;
 }
 
 }  // namespace framework
diff --git a/paddle/fluid/framework/new_executor/interpretercore.h b/paddle/fluid/framework/new_executor/interpretercore.h
index 9fba5f2cdce8b..915ae782e2210 100644
--- a/paddle/fluid/framework/new_executor/interpretercore.h
+++ b/paddle/fluid/framework/new_executor/interpretercore.h
@@ -40,22 +40,23 @@ using AtomicVectorSizeT = std::vector<std::unique_ptr<std::atomic<size_t>>>;
 
 class InterpreterCore {
  public:
-  InterpreterCore(const platform::Place& place, const ProgramDesc& main_prog,
-                  VariableScope* global_scope,
-                  const std::vector<std::string>& feed_names,
-                  const std::vector<std::string>& fetch_names);
+  InterpreterCore(const platform::Place& place, const BlockDesc& block,
+                  VariableScope* global_scope);
+
+  ~InterpreterCore();
 
   paddle::framework::FetchList Run(
-      const std::vector<framework::Tensor>& feed_tensors);
+      const std::vector<std::string>& feed_names,
+      const std::vector<framework::LoDTensor>& feed_tensors);
 
-  const CostInfo& DryRun(const std::vector<framework::Tensor>& feed_tensors);
+  interpreter::CostInfo DryRun(
+      const std::vector<std::string>& feed_names,
+      const std::vector<framework::LoDTensor>& feed_tensors);
 
  private:
   void Convert();
 
-  void BuildAndCacheInstructionCtx(Instruction* instr_node,
-                                   const VariableScope& var_scope,
-                                   const platform::Place& place);
+  void BuildAndCacheInstructionCtx(Instruction* instr_node);
 
   void BuildInplace();
 
@@ -65,46 +66,39 @@ class InterpreterCore {
 
   void ExecuteInstructionList(const std::vector<Instruction>& vec_instr);
 
-  void DryRunPrepare(const std::vector<framework::Tensor>& feed_tensors);
+  void Prepare(const std::vector<std::string>& feed_names,
+               const std::vector<framework::LoDTensor>& feed_tensors,
+               bool prepare_feed);
 
-  void CheckGC(size_t instr_id, const std::vector<size_t>& gc_check_list);
+  void CheckGC(const Instruction& instr);
 
   void RunInstructionAsync(size_t instr_id);
   void RunNextInstructions(const Instruction& instr_id,
                            std::queue<size_t>* reserved_next_ops);
-  void AddFetch(const std::vector<std::string>& fetch_names);
 
   void BuildSkipShareLoDInfo();
 
   bool is_build_;
 
   const platform::Place& place_;
-  ProgramDesc main_program_;
-  VariableScope* global_scope_;
-
-  std::vector<Instruction> vec_instruction_;
-  InstructionInfo instruction_info_;
-  std::vector<size_t> dependecy_count_;
-  std::vector<std::vector<size_t>> input_var2op_info_;
-  std::vector<VariableMetaInfo> ref_coun_info_;
-  std::vector<VariableMetaInfo> vec_meta_info_;
+  const BlockDesc& block_;       // not owned
+  VariableScope* global_scope_;  // not owned
 
   std::vector<paddle::framework::OpFuncNode> vec_func_list_;
-  std::vector<paddle::framework::OperatorBase*> op_list_;
+  std::vector<Instruction> vec_instruction_;  // deconstruct before OpFuncNode
 
-  std::vector<std::string> feed_names_;
+  std::vector<size_t> dependecy_count_;
+  std::atomic<size_t> op_run_number_{0};
+  std::vector<std::vector<size_t>> input_var2op_info_;
 
-  InterpreterProfiler dry_run_profiler_;
   StreamAnalyzer stream_analyzer_;
-  EventManager event_manager_;
   EventsWaiter main_thread_blocker_;
-  interpretercore::AsyncWorkQueue async_work_queue_;
+  std::unique_ptr<interpreter::AsyncWorkQueue> async_work_queue_;
   details::ExceptionHolder exception_holder_;
   std::shared_ptr<EventsWaiter::EventNotifier> exception_notifier_{nullptr};
 
-  InterpreterCoreGarbageCollector gc_;
+  std::unique_ptr<InterpreterCoreGarbageCollector> gc_;
   std::vector<paddle::platform::DeviceEvent> gc_event_;
-  std::atomic<size_t> op_run_number_{0};
 };
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/new_executor/interpretercore_garbage_collector.cc b/paddle/fluid/framework/new_executor/interpretercore_garbage_collector.cc
index 2ae84d9dcdddd..59dd44ab9ada6 100644
--- a/paddle/fluid/framework/new_executor/interpretercore_garbage_collector.cc
+++ b/paddle/fluid/framework/new_executor/interpretercore_garbage_collector.cc
@@ -28,6 +28,10 @@ InterpreterCoreGarbageCollector::InterpreterCoreGarbageCollector() {
   queue_ = CreateSingleThreadedWorkQueue(options);
 }
 
+InterpreterCoreGarbageCollector::~InterpreterCoreGarbageCollector() {
+  queue_.reset(nullptr);
+}
+
 void InterpreterCoreGarbageCollector::Add(
     std::shared_ptr<memory::Allocation> garbage,
     paddle::platform::DeviceEvent& event, const platform::DeviceContext* ctx) {
@@ -58,6 +62,11 @@ void InterpreterCoreGarbageCollector::Add(paddle::framework::Variable* var,
                                           const platform::DeviceContext* ctx) {
   if (var->IsType<LoDTensor>()) {
     Add(var->GetMutable<LoDTensor>()->MoveMemoryHolder(), event, ctx);
+  } else if (var->IsType<
+                 operators::reader::
+                     OrderedMultiDeviceLoDTensorBlockingQueueHolder>()) {
+    // var->Clear(); // TODO(xiongkun03) can we clear directly? Why we must use
+    // Add interface?
   } else if (var->IsType<SelectedRows>()) {
     Add(var->GetMutable<SelectedRows>()->mutable_value()->MoveMemoryHolder(),
         event, ctx);
@@ -66,6 +75,10 @@ void InterpreterCoreGarbageCollector::Add(paddle::framework::Variable* var,
     for (auto& t : *tensor_arr) {
       Add(t.MoveMemoryHolder(), event, ctx);
     }
+  } else if (var->IsType<std::vector<Scope*>>()) {
+    // NOTE(@xiongkun03) conditional_op / while_op will create a STEP_SCOPE
+    // refer to executor.cc to see what old garbage collector does.
+    // do nothing, because the sub scope will be deleted by sub-executor.
   } else {
     PADDLE_THROW(platform::errors::Unimplemented(
         "The variable(%s) is not supported in eager deletion.",
diff --git a/paddle/fluid/framework/new_executor/interpretercore_garbage_collector.h b/paddle/fluid/framework/new_executor/interpretercore_garbage_collector.h
index b1157c861754c..166139a73c8f9 100644
--- a/paddle/fluid/framework/new_executor/interpretercore_garbage_collector.h
+++ b/paddle/fluid/framework/new_executor/interpretercore_garbage_collector.h
@@ -35,6 +35,8 @@ class InterpreterCoreGarbageCollector {
  public:
   InterpreterCoreGarbageCollector();
 
+  ~InterpreterCoreGarbageCollector();
+
   void Add(std::shared_ptr<memory::Allocation> garbage,  // NOLINT
            paddle::platform::DeviceEvent& event,         // NOLINT
            const platform::DeviceContext* ctx);
diff --git a/paddle/fluid/framework/new_executor/interpretercore_util.cc b/paddle/fluid/framework/new_executor/interpretercore_util.cc
index 7bb0429c6228b..acf0b4b30c781 100644
--- a/paddle/fluid/framework/new_executor/interpretercore_util.cc
+++ b/paddle/fluid/framework/new_executor/interpretercore_util.cc
@@ -18,7 +18,8 @@
 
 namespace paddle {
 namespace framework {
-namespace interpretercore {
+namespace interpreter {
+using VariableIdMap = std::map<std::string, std::vector<int>>;
 
 AtomicVectorSizeT& AsyncWorkQueue::PrepareAtomicDeps(
     const std::vector<size_t>& dependecy_count) {
@@ -128,47 +129,31 @@ std::string get_memcpy_type(const platform::Place& src_place,
   }
 }
 
-void build_variable_scope(const framework::ProgramDesc& pdesc,
+void build_variable_scope(const framework::BlockDesc& block,
                           VariableScope* var_scope) {
-  auto& global_block = pdesc.Block(0);
-
-  for (auto& var : global_block.AllVars()) {
-    if (var->Name() == framework::kEmptyVarName) {
+  for (auto& var_desc : block.AllVars()) {
+    auto var_name = var_desc->Name();
+    if (var_name == framework::kEmptyVarName) {
       continue;
     }
 
-    if (var_scope->name2id.find(var->Name()) == var_scope->name2id.end()) {
-      var_scope->name2id[var->Name()] = var_scope->var_list.size();
-      auto v = new Variable();
-      InitializeVariable(v, var->GetType());
-      var_scope->var_list.push_back(v);
-
-      VariableMetaInfo info;
-      info.var_ref_count_ = 0;
-      info.vardesc_ = var;
-      var_scope->vec_meta_info_.push_back(info);
+    if (nullptr == var_scope->FindVar(var_name)) {
+      var_scope->AddVar(var_desc->Name(), var_desc);
     } else {
-      auto var_id = var_scope->name2id[var->Name()];
-      if (nullptr == var_scope->vec_meta_info_[var_id].vardesc_) {
-        VLOG(3) << "update var:" << var->Name() << " desc from nullptr into "
-                << var;
-        var_scope->vec_meta_info_[var_id].vardesc_ = var;
+      auto* var_desc_tmp = var_scope->VarDesc(var_name);
+      if (nullptr == var_desc_tmp) {
+        VLOG(3) << "update var:" << var_name << " desc from nullptr into "
+                << var_desc;
+        var_scope->SetVarDesc(var_name, var_desc);
       }
     }
   }
 }
 
-void build_op_func_list(const platform::Place& place,
-                        const framework::ProgramDesc& pdesc,
-                        std::vector<OperatorBase*>* op_list,
-                        std::vector<OpFuncNode>* vec_func_list,
-                        VariableScope* var_scope) {
-  auto& global_block = pdesc.Block(0);
-  auto& all_op_kernels = OperatorWithKernel::AllOpKernels();
-
+std::vector<OperatorBase*> create_all_ops(const framework::BlockDesc& block) {
   std::vector<OperatorBase*> ops;
-  for (auto& op : global_block.AllOps()) {
-    VLOG(3) << "Build OpFuncNode from : " << op->Type();
+  for (auto& op : block.AllOps()) {
+    VLOG(3) << "CreateOp from : " << op->Type();
 
     auto& info = OpInfoMap::Instance().Get(op->Type());
 
@@ -179,252 +164,381 @@ void build_op_func_list(const platform::Place& place,
     if (info.Checker() != nullptr) {
       info.Checker()->Check(&op_attr_map);
     }
-    // step 1. Prepare VariableValueMap of input/output
     auto op_base =
         info.Creator()(op->Type(), inputs_names, outputs_names, op_attr_map);
     ops.push_back(op_base);
   }
+  return ops;
+}
 
-  auto unused_var_map = get_unused_vars(global_block, ops);
+std::tuple<VariableValueMap, VariableIdMap> build_variable_map(
+    const VariableNameMap& var_name_map, VariableScope* var_scope,
+    bool enforce_exist = true) {
+  VariableValueMap name2var;
+  VariableIdMap name2id;
+  for (auto& item : var_name_map) {
+    std::vector<Variable*> vars;
+    std::vector<int> ids;
+    vars.reserve(item.second.size());
+
+    for (auto& var_name : item.second) {
+      if (!enforce_exist && !var_scope->HasVar(var_name)) {
+        // skip the non-exist variable: such as recurrent_grad
+        VLOG(4) << var_name << " don't exist in variable scope, skip it!";
+        continue;
+      }
+      auto var_id = var_scope->VarId(var_name);
+      auto* in_var = var_scope->Var(var_id);
+      vars.push_back(in_var);
+      ids.push_back(var_id);
+    }
+    name2var[item.first] = std::move(vars);
+    name2id[item.first] = std::move(ids);
+  }
+  return std::make_tuple(name2var, name2id);
+}
+
+void apply_device_guard(const OperatorBase* op_base,
+                        const platform::Place& place,
+                        OpKernelType* expected_kernel_key) {
+  bool need_change_place =
+      (op_base->HasAttr("op_device") &&
+       (op_base->Attr<std::string>("op_device").length() > 0));
+  if (need_change_place) {
+    auto& op_device = op_base->Attr<std::string>("op_device");
+    if (op_device == "cpu" || platform::is_cpu_place(place)) {
+      VLOG(3) << "Switch into CPUPlace by device_guard.";
+      expected_kernel_key->place_ = platform::CPUPlace();
+    } else if (op_device.find("gpu") != std::string::npos &&
+               (platform::is_gpu_place(place) ||
+                platform::is_npu_place(place))) {
+      // when the Op that only has CPUKernel is assigned to GPU, the CPUKernel
+      // will be executed and a warning will be given at the same time.
+      if (op_base->SupportGPU()) {
+        expected_kernel_key->place_ = place;
+      } else if (op_base->SupportNPU()) {
+        expected_kernel_key->place_ = place;
+      } else {
+        expected_kernel_key->place_ = platform::CPUPlace();
+        LOG_FIRST_N(WARNING, 1)
+            << "Op(" << op_base->Type()
+            << ") has no CUDA implementation. It will be assigned to CPUPlace.";
+      }
+      VLOG(3) << "Switch into " << expected_kernel_key->place_
+              << " by device_guard.";
+    } else {
+      PADDLE_THROW(
+          platform::errors::Fatal("Unsupported current place %s", op_device));
+    }
+  }
+}
+
+void deal_operator_base(const platform::Place& place,
+                        const VariableScope* var_scope, OperatorBase* op_base,
+                        OpFuncNode* op_func_node) {
+  platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
+  auto* dev_ctx = pool.Get(place);
+  // input, output is prepared. set the other attributes.
+  op_func_node->operator_base_ = op_base;
+  op_func_node->type_ = OpFuncType::kQueueSync;  // alway Sync
+  op_func_node->kernel_func_ = nullptr;
+  op_base->Run(*var_scope->GetScope(), place);  // Run without data transformer.
+
+  std::unordered_set<int> no_data_transform_index;
+  for (auto& it : op_func_node->input_index) {
+    for (auto& id : it.second) {
+      no_data_transform_index.emplace(id);
+    }
+  }
+  op_func_node->no_data_transform_index =
+      no_data_transform_index;  // all index is no-need-transform
+  op_func_node->dev_ctx_ = dev_ctx;
+}
+
+// the return value is whether data transformer is needed for this var
+bool need_place_transform_for_var(const OpKernelType& kernel_type_for_var,
+                                  const OpKernelType& expected_kernel_key) {
+  if (platform::is_same_place(kernel_type_for_var.place_,
+                              expected_kernel_key.place_) ||
+      (is_cuda_pinned_place(kernel_type_for_var.place_) &&
+       is_cpu_place(expected_kernel_key.place_))) {
+    return false;
+  } else {
+    return true;
+  }
+}
+
+bool need_dtype_transform_for_var(const OpKernelType& kernel_type_for_var,
+                                  const OpKernelType& expected_kernel_key) {
+  return false;  // TODO(@xiongkun) add dtype judgement here
+}
+
+bool need_layout_transform_for_var(const OpKernelType& kernel_type_for_var,
+                                   const OpKernelType& expected_kernel_key) {
+  return false;  // TODO(@xiongkun) add layout judgement here
+}
+
+// NOTE(@xiongkun03)
+// the difference between var_name and outer_name :
+// if "X": ["var1", "var2"], then X is the outer name,
+// var1 and var2 is the var_name
+std::tuple<std::string, OpFuncNode> apply_place_transform_for_var(
+    const OpKernelType& kernel_type_for_var,
+    const OpKernelType& expected_kernel_key, const platform::Place& place,
+    const std::string& var_name, const std::string& outer_name,
+    const OpFuncNode& op_func_node, Variable* var, VariableScope* var_scope) {
+  auto& ins_name2id = op_func_node.input_index;
+  auto& all_op_kernels = OperatorWithKernel::AllOpKernels();
+  platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
+  std::string new_var_name =
+      var_name + "_copy_" + std::to_string(var_scope->VarSize() + 1);
+  var_scope->AddVar(new_var_name, nullptr);
+
+  VariableNameMap copy_in_map;
+  copy_in_map["X"] = {var_name};
+  VariableNameMap copy_out_map;
+  copy_out_map["Out"] = {new_var_name};
+  AttributeMap attr_map;
+  attr_map["dst_place_type"] =
+      is_cpu_place(expected_kernel_key.place_)
+          ? 0
+          : is_gpu_place(expected_kernel_key.place_) ? 1 : -1;
+
+  std::map<std::string, std::vector<int>> copy_ins_name2id;
+  copy_ins_name2id["X"] = ins_name2id.at(outer_name);
+  std::map<std::string, std::vector<int>> copy_out_name2id;
+  copy_out_name2id["Out"] = {var_scope->VarId(new_var_name)};
+
+  VariableValueMap copy_ins_value_map;
+  copy_ins_value_map["X"] = {var};
+  VariableValueMap copy_outs_value_map;
+  copy_outs_value_map["Out"] = {var_scope->Var(new_var_name)};
+
+  // memcpy_d2h, memcpy_h2d
+  auto memcpy_op_type =
+      get_memcpy_type(kernel_type_for_var.place_, expected_kernel_key.place_);
+  VLOG(3) << string::Sprintf("Insert %s with %s(%s) -> %s(%s).", memcpy_op_type,
+                             var_name, kernel_type_for_var.place_, new_var_name,
+                             expected_kernel_key.place_);
+  auto& copy_info = OpInfoMap::Instance().Get(memcpy_op_type);
+  auto copy_op =
+      copy_info.Creator()(memcpy_op_type, copy_in_map, copy_out_map, attr_map);
+  OpFuncNode copy_op_func_node;
+  copy_op_func_node.input_index = copy_ins_name2id;
+  copy_op_func_node.output_index = copy_out_name2id;
+
+  RuntimeContext copy_runtime_context({}, {});
+  copy_runtime_context.inputs.swap(copy_ins_value_map);
+  copy_runtime_context.outputs.swap(copy_outs_value_map);
+  InterpretercoreInferShapeContext copy_infer_shape_ctx(*copy_op,
+                                                        copy_runtime_context);
+  static_cast<const framework::OperatorWithKernel*>(copy_op)->InferShape(
+      &copy_infer_shape_ctx);
+
+  auto kernels_iter = all_op_kernels.find(memcpy_op_type);
+  PADDLE_ENFORCE_NE(kernels_iter, all_op_kernels.end(),
+                    platform::errors::Unavailable(
+                        "There are no kernels which are registered in "
+                        "the memcpy operator."));
+
+  OpKernelMap& kernels = kernels_iter->second;
+  auto* dev_ctx = pool.Get(place);
+  Scope scope;
+  auto copy_exec_ctx =
+      ExecutionContext(*copy_op, scope, *dev_ctx, copy_runtime_context);
+  auto copy_expected_kernel_key =
+      dynamic_cast<const framework::OperatorWithKernel*>(copy_op)
+          ->GetExpectedKernelType(copy_exec_ctx);
+  auto kernel_iter = kernels.find(copy_expected_kernel_key);
+  copy_op_func_node.kernel_func_ = OpKernelComputeFunc(kernel_iter->second);
+  copy_op_func_node.kernel_func_(copy_exec_ctx);
+  VLOG(3) << "Run " << memcpy_op_type << " done.";
+  // NOTE(Aurelius84): memcpy_op is expensive operation, so we tag them
+  // as kQueueSync and execute them in thread pool.
+  copy_op_func_node.type_ = OpFuncType::kQueueSync;
+  copy_op_func_node.dev_ctx_ = dev_ctx;
+  copy_op_func_node.operator_base_ = copy_op;
+
+  return std::make_pair(new_var_name, copy_op_func_node);
+}
+
+std::vector<OpFuncNode> apply_data_transform(
+    const OpKernelType& expected_kernel_key, const platform::Place& place,
+    VariableValueMap* ins_map_temp, VariableScope* var_scope,
+    OpFuncNode* op_func_node) {
+  auto& op_base = op_func_node->operator_base_;
+  PADDLE_ENFORCE_NOT_NULL(op_base, platform::errors::PreconditionNotMet(
+                                       "op_base is null, please pass a valid "
+                                       "op_base in apply_data_transform."));
+  auto inputs_names = op_base->Inputs();
+
+  std::unordered_set<int>
+      no_data_transform_index;  // record the no need transform variable index.
+  std::vector<OpFuncNode> copy_func_nodes;  // return all the copy opfuncnode.
+
+  for (auto& var_name_item : *ins_map_temp) {
+    for (size_t i = 0; i < var_name_item.second.size(); ++i) {
+      auto var = var_name_item.second[i];
+      auto& var_name = inputs_names[var_name_item.first].at(i);
+      auto tensor_in = GetLoDTensorOrSelectedRowsValueFromVar(*var);
+      if (!tensor_in->IsInitialized()) {
+        continue;
+      }
+      auto kernel_type_for_var =  // the true kernel type for op_base
+          static_cast<const framework::OperatorWithKernel*>(op_base)
+              ->GetKernelTypeForVar(var_name_item.first, *tensor_in,
+                                    expected_kernel_key);
+      if (need_place_transform_for_var(kernel_type_for_var,
+                                       expected_kernel_key)) {
+        if (op_base->Type() == "fetch_v2") {
+          op_base->SetAttr("deepcopy", false);
+        }
+        std::string new_var_name;
+        OpFuncNode copy_op_func_node;
+        std::tie(new_var_name, copy_op_func_node) =
+            apply_place_transform_for_var(
+                kernel_type_for_var, expected_kernel_key, place, var_name,
+                var_name_item.first, *op_func_node, var, var_scope);
+        op_func_node->input_index[var_name_item.first][i] =
+            var_scope->VarId(new_var_name);
+        copy_func_nodes.push_back(copy_op_func_node);
+        var_name_item.second[i] = var_scope->Var(new_var_name);
+      } else if (need_dtype_transform_for_var(kernel_type_for_var,
+                                              expected_kernel_key)) {
+        // TODO(@xiongkun) add dtype judgement here
+      } else if (need_layout_transform_for_var(kernel_type_for_var,
+                                               expected_kernel_key)) {
+        // TODO(@xiongkun) add layout judgement here
+      } else {
+        // record no need data transformer input var_id
+        VLOG(3) << op_base->Type()
+                << " found no data_transform var: " << var_name
+                << " with id: " << var_scope->VarId(var_name);
+        no_data_transform_index.emplace(var_scope->VarId(var_name));
+      }
+    }
+  }
+  op_func_node->no_data_transform_index = std::move(no_data_transform_index);
+  return copy_func_nodes;
+}
+
+void build_op_func_list(const platform::Place& place,
+                        const framework::BlockDesc& block,
+                        std::vector<OpFuncNode>* vec_func_list,
+                        VariableScope* var_scope) {
+  auto& all_op_kernels = OperatorWithKernel::AllOpKernels();
+
+  // Step 1: create all ops for current block.
+  auto ops = create_all_ops(block);
+  auto unused_var_map = get_unused_vars(block, ops);
 
   size_t ops_index = 0;
-  for (auto& op : global_block.AllOps()) {
-    VLOG(3) << op->Type();
-    // << op->Type() << endl;
+  for (auto& op : block.AllOps()) {
+    VLOG(6) << "Build OpFuncNode from : " << op->Type();
 
     auto op_base = ops[ops_index++];
-
     auto inputs_names = op->Inputs();
     auto outputs_names = op->Outputs();
 
     VariableValueMap ins_map;
-    std::map<std::string, std::vector<int>> ins_name2id;
-    for (auto& var_name_item : inputs_names) {
-      std::vector<Variable*> input_vars;
-      std::vector<int> vec_ids;
-      input_vars.reserve(var_name_item.second.size());
-      for (auto& var_name : var_name_item.second) {
-        auto it = var_scope->name2id.find(var_name);
-        assert(it != var_scope->name2id.end());
-        input_vars.push_back(var_scope->var_list[it->second]);
-        vec_ids.push_back(it->second);
-      }
-      ins_map[var_name_item.first] = input_vars;
-      ins_name2id[var_name_item.first] = vec_ids;
-    }
+    VariableIdMap ins_name2id;
+    bool enforce_exist = true;
+    if (op->Type() == "recurrent_grad") enforce_exist = false;
+    std::tie(ins_map, ins_name2id) =
+        build_variable_map(inputs_names, var_scope, enforce_exist);
 
     VariableValueMap outs_map;
-    std::map<std::string, std::vector<int>> outs_name2id;
-    for (auto& var_name_item : outputs_names) {
-      std::vector<Variable*> output_vars;
-      std::vector<int> vec_ids;
-      output_vars.reserve(var_name_item.second.size());
-      for (auto& var_name : var_name_item.second) {
-        auto it = var_scope->name2id.find(var_name);
-        assert(it != var_scope->name2id.end());
-        output_vars.push_back(var_scope->var_list[it->second]);
-        vec_ids.push_back(it->second);
-      }
-      outs_map[var_name_item.first] = output_vars;
-      outs_name2id[var_name_item.first] = vec_ids;
-    }
+    VariableIdMap outs_name2id;
+    std::tie(outs_map, outs_name2id) =
+        build_variable_map(outputs_names, var_scope, enforce_exist);
 
+    // step 2: build OpFuncNode
     OpFuncNode op_func_node;
     op_func_node.input_index = ins_name2id;
     op_func_node.output_index = outs_name2id;
-    // step 2: construct RuntimeContext and analysis KernelType
-    RuntimeContext runtime_context({}, {});
-    runtime_context.inputs.swap(ins_map);
-    runtime_context.outputs.swap(outs_map);
-    InterpretercoreInferShapeContext infer_shape_ctx(*op_base, runtime_context);
-    static_cast<const framework::OperatorWithKernel*>(op_base)->InferShape(
-        &infer_shape_ctx);
-    auto kernels_iter = all_op_kernels.find(op->Type());
-    PADDLE_ENFORCE_NE(
-        kernels_iter, all_op_kernels.end(),
-        platform::errors::Unavailable(
-            "There are no kernels which are registered in the %s operator.",
-            op->Type()));
-
-    OpKernelMap& kernels = kernels_iter->second;
-
-    platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
-    auto* dev_ctx = pool.Get(place);
-    Scope scope;
-    auto expected_kernel_key =
-        dynamic_cast<const framework::OperatorWithKernel*>(op_base)
-            ->GetExpectedKernelType(
-                ExecutionContext(*op_base, scope, *dev_ctx, runtime_context));
-
-    // consider device_guard context
-    bool need_change_place =
-        (op_base->HasAttr("op_device") &&
-         (op_base->Attr<std::string>("op_device").length() > 0));
-    if (need_change_place) {
-      auto& op_device = op_base->Attr<std::string>("op_device");
-      if (op_device == "cpu" || platform::is_cpu_place(place)) {
-        VLOG(3) << "Switch into CPUPlace by device_guard.";
-        expected_kernel_key.place_ = platform::CPUPlace();
-      } else if (op_device.find("gpu") != std::string::npos &&
-                 platform::is_gpu_place(place)) {
-        VLOG(3) << "Switch into " << place << " by device_guard.";
-        expected_kernel_key.place_ = place;
+
+    if (dynamic_cast<const framework::OperatorWithKernel*>(op_base) ==
+        nullptr) {
+      // op is not a operatorwithkernel, so direcly run OperatorBase::Run()
+      deal_operator_base(place, var_scope, op_base, &op_func_node);
+    } else {
+      // construct RuntimeContext and analysis KernelType
+      RuntimeContext runtime_context({}, {});
+      runtime_context.inputs.swap(ins_map);
+      runtime_context.outputs.swap(outs_map);
+      InterpretercoreInferShapeContext infer_shape_ctx(*op_base,
+                                                       runtime_context);
+      // TODO(Aurelius84): In case of control flow ops, they are NOT inheritted
+      // from OperatorWithKernel.
+      static_cast<const framework::OperatorWithKernel*>(op_base)->InferShape(
+          &infer_shape_ctx);
+      auto kernels_iter = all_op_kernels.find(op->Type());
+      PADDLE_ENFORCE_NE(
+          kernels_iter, all_op_kernels.end(),
+          platform::errors::Unavailable(
+              "There are no kernels which are registered in the %s operator.",
+              op->Type()));
+
+      OpKernelMap& kernels = kernels_iter->second;
+
+      platform::DeviceContextPool& pool =
+          platform::DeviceContextPool::Instance();
+      auto* dev_ctx = pool.Get(place);
+      Scope scope;
+      auto expected_kernel_key =
+          dynamic_cast<const framework::OperatorWithKernel*>(op_base)
+              ->GetExpectedKernelType(
+                  ExecutionContext(*op_base, scope, *dev_ctx, runtime_context));
+
+      // consider device_guard()
+      apply_device_guard(
+          op_base, place,
+          &expected_kernel_key);  // change device by the device_guard()
+      VLOG(3) << "expected_kernel_key : " << expected_kernel_key;
+
+      // step 3. apply data transforms and insert memory ops
+      VariableValueMap& ins_map_temp = runtime_context.inputs;
+      std::vector<OpFuncNode> copy_op_to_insert;
+      // NOTE(xiongkun03): assign op_base here to reduce parameter number of
+      // apply_data_transform.
+      op_func_node.operator_base_ = op_base;
+      copy_op_to_insert = apply_data_transform(
+          expected_kernel_key, place, &ins_map_temp, var_scope, &op_func_node);
+      for (auto& item : copy_op_to_insert) {
+        vec_func_list->push_back(item);
+      }
+      // step 4. Run op kernel
+      VLOG(3) << op_base->Type()
+              << " : expected_kernel_key : " << expected_kernel_key;
+
+      if (platform::is_gpu_place(expected_kernel_key.place_)) {
+        op_func_node.type_ = OpFuncType::kQueueAsync;
+      } else if (platform::is_cpu_place(expected_kernel_key.place_)) {
+        op_func_node.type_ = OpFuncType::kQueueSync;
       } else {
-        PADDLE_THROW(
-            platform::errors::Fatal("Unsupported current place %s", op_device));
+        PADDLE_THROW(platform::errors::Fatal("Unsupported current place %s",
+                                             expected_kernel_key.place_));
       }
-    }
-    VLOG(3) << "expected_kernel_key : " << expected_kernel_key;
-
-    // step 3. Insert memcpy_op if needed
-    VariableValueMap& ins_map_temp = runtime_context.inputs;
-    std::unordered_set<int> no_data_transform_index;
-    for (auto& var_name_item : ins_map_temp) {
-      for (size_t i = 0; i < var_name_item.second.size(); ++i) {
-        auto var = var_name_item.second[i];
-        auto tensor_in = static_cast<const Tensor*>(&(var->Get<LoDTensor>()));
-        if (!tensor_in->IsInitialized()) {
-          continue;
-        }
-        auto kernel_type_for_var =
-            static_cast<const framework::OperatorWithKernel*>(op_base)
-                ->GetKernelTypeForVar(var_name_item.first, *tensor_in,
-                                      expected_kernel_key);
-        if (platform::is_same_place(kernel_type_for_var.place_,
-                                    expected_kernel_key.place_)) {
-          // record no need data transformer input var_id
-          auto& var_name = inputs_names[var_name_item.first][i];
-          VLOG(3) << op->Type() << " found no data_transform var: " << var_name
-                  << " with id: " << var_scope->name2id[var_name];
-          no_data_transform_index.emplace(var_scope->name2id[var_name]);
-        } else {
-          if (op_base->Type() == "fetch_v2") {
-            op_base->SetAttr("deepcopy", false);
-          }
-          // need trans place
-          // 1. add var in scope
-          // 2. add copy op
-          std::string new_var_name =
-              "temp_1" + std::to_string(var_scope->var_list.size() + 1);
-          auto v = new Variable();
-          v->GetMutable<LoDTensor>();
-          var_scope->name2id[new_var_name] = var_scope->var_list.size();
-          var_scope->var_list.push_back(v);
-
-          VariableMetaInfo info;
-          info.var_ref_count_ = 0;
-          info.vardesc_ = nullptr;
-          var_scope->vec_meta_info_.push_back(info);
-
-          VariableNameMap copy_in_map;
-          auto x_iter = inputs_names.find(var_name_item.first);
-          copy_in_map["X"] = {x_iter->second[i]};
-          VariableNameMap copy_out_map;
-          copy_out_map["Out"] = {new_var_name};
-          AttributeMap attr_map;
-          attr_map["dst_place_type"] =
-              is_cpu_place(expected_kernel_key.place_)
-                  ? 0
-                  : is_gpu_place(expected_kernel_key.place_) ? 1 : -1;
-
-          std::map<std::string, std::vector<int>> copy_ins_name2id;
-          copy_ins_name2id["X"] = ins_name2id[var_name_item.first];
-          std::map<std::string, std::vector<int>> copy_out_name2id;
-          copy_out_name2id["Out"] = {var_scope->name2id[new_var_name]};
-
-          op_func_node.input_index[var_name_item.first][i] =
-              var_scope->name2id[new_var_name];
-
-          VariableValueMap copy_ins_value_map;
-          copy_ins_value_map["X"] = {var};
-          VariableValueMap copy_outs_value_map;
-          copy_outs_value_map["Out"] = {v};
-
-          // memcpy_d2h, memcpy_h2d
-          auto memcpy_op_type = get_memcpy_type(kernel_type_for_var.place_,
-                                                expected_kernel_key.place_);
-          VLOG(3) << string::Sprintf("Insert %s with %s(%s) -> %s(%s).",
-                                     memcpy_op_type, x_iter->second[i],
-                                     kernel_type_for_var.place_, new_var_name,
-                                     expected_kernel_key.place_);
-          auto& copy_info = OpInfoMap::Instance().Get(memcpy_op_type);
-          auto copy_op = copy_info.Creator()(memcpy_op_type, copy_in_map,
-                                             copy_out_map, attr_map);
-          OpFuncNode copy_op_func_node;
-          copy_op_func_node.input_index = copy_ins_name2id;
-          copy_op_func_node.output_index = copy_out_name2id;
-
-          RuntimeContext copy_runtime_context({}, {});
-          copy_runtime_context.inputs.swap(copy_ins_value_map);
-          copy_runtime_context.outputs.swap(copy_outs_value_map);
-          InterpretercoreInferShapeContext copy_infer_shape_ctx(
-              *copy_op, copy_runtime_context);
-          static_cast<const framework::OperatorWithKernel*>(copy_op)
-              ->InferShape(&copy_infer_shape_ctx);
-
-          auto kernels_iter = all_op_kernels.find(memcpy_op_type);
-          PADDLE_ENFORCE_NE(kernels_iter, all_op_kernels.end(),
-                            platform::errors::Unavailable(
-                                "There are no kernels which are registered in "
-                                "the memcpy operator."));
-
-          OpKernelMap& kernels = kernels_iter->second;
-          auto* dev_ctx = pool.Get(place);
-          Scope scope;
-          auto copy_exec_ctx =
-              ExecutionContext(*copy_op, scope, *dev_ctx, copy_runtime_context);
-          auto expected_kernel_key =
-              dynamic_cast<const framework::OperatorWithKernel*>(copy_op)
-                  ->GetExpectedKernelType(copy_exec_ctx);
-          auto kernel_iter = kernels.find(expected_kernel_key);
-          copy_op_func_node.kernel_func_ =
-              OpKernelComputeFunc(kernel_iter->second);
-          copy_op_func_node.kernel_func_(copy_exec_ctx);
-          VLOG(3) << "Run " << memcpy_op_type << " done.";
-          // NOTE(Aurelius84): memcpy_op is expensive operation, so we tag them
-          // as kQueueSync and execute them in thread pool.
-          copy_op_func_node.type_ = OpFuncType::kQueueSync;
-          copy_op_func_node.dev_ctx_ = dev_ctx;
-          op_list->push_back(copy_op);
-          vec_func_list->push_back(copy_op_func_node);
-
-          var_name_item.second[i] = v;
-        }
+      if (!(expected_kernel_key.place_ == dev_ctx->GetPlace())) {
+        dev_ctx = pool.Get(expected_kernel_key.place_);
       }
-    }
-    op_func_node.no_data_transform_index = std::move(no_data_transform_index);
-    // step 4. Run op kernel
-    op_list->push_back(op_base);
-    VLOG(3) << op_base->Type()
-            << " : expected_kernel_key : " << expected_kernel_key;
-
-    if (platform::is_gpu_place(expected_kernel_key.place_)) {
-      op_func_node.type_ = OpFuncType::kQueueAsync;
-    } else if (platform::is_cpu_place(expected_kernel_key.place_)) {
-      op_func_node.type_ = OpFuncType::kQueueSync;
-    } else {
-      PADDLE_THROW(platform::errors::Fatal("Unsupported current place %s",
-                                           expected_kernel_key.place_));
-    }
+      op_func_node.dev_ctx_ = dev_ctx;
 
-    if (!(expected_kernel_key.place_ == dev_ctx->GetPlace())) {
-      dev_ctx = pool.Get(expected_kernel_key.place_);
-    }
-    op_func_node.dev_ctx_ = dev_ctx;
+      auto exec_ctx =
+          ExecutionContext(*op_base, scope, *dev_ctx, runtime_context);
 
-    auto exec_ctx =
-        ExecutionContext(*op_base, scope, *dev_ctx, runtime_context);
+      auto kernel_iter = kernels.find(expected_kernel_key);
+      PADDLE_ENFORCE_NE(
+          kernel_iter, kernels.end(),
+          platform::errors::NotFound(
+              "Operator (%s) does not have kernel for %s.", op->Type(),
+              KernelTypeToString(expected_kernel_key)));
 
-    auto kernel_iter = kernels.find(expected_kernel_key);
-    PADDLE_ENFORCE_NE(kernel_iter, kernels.end(),
-                      platform::errors::NotFound(
-                          "Operator (%s) does not have kernel for %s.",
-                          op->Type(), KernelTypeToString(expected_kernel_key)));
+      op_func_node.kernel_func_ = OpKernelComputeFunc(kernel_iter->second);
+      op_func_node.kernel_func_(exec_ctx);
+    }
 
-    op_func_node.kernel_func_ = OpKernelComputeFunc(kernel_iter->second);
-    op_func_node.kernel_func_(exec_ctx);
     vec_func_list->push_back(op_func_node);
-
     // gc---------------------------------------------------------------------------
     auto iter = unused_var_map.find(op_base);
     if (iter == unused_var_map.end()) {
@@ -436,14 +550,12 @@ void build_op_func_list(const platform::Place& place,
         new std::deque<std::shared_ptr<memory::Allocation>>();
 
     for (auto& var_name : delete_vars) {
-      auto it = var_scope->name2id.find(var_name);
-      assert(it != var_scope->name2id.end());
-      auto* var = var_scope->var_list[it->second];
+      auto* var = var_scope->FindVar(var_name);
       if (var == nullptr) {
         continue;
       }
 
-      VLOG(2) << "Erase variable " << var_name;
+      VLOG(6) << "Erase variable " << var_name;
       if (var->IsType<LoDTensor>()) {
         garbages->emplace_back(
             var->GetMutable<LoDTensor>()->MoveMemoryHolder());
@@ -469,6 +581,25 @@ void build_op_func_list(const platform::Place& place,
   }
 }
 
+void add_fetch(const std::vector<std::string>& fetch_names,
+               framework::BlockDesc* block) {
+  auto* fetch_holder = block->Var(kFetchVarName);
+  fetch_holder->SetType(proto::VarType::FETCH_LIST);
+  fetch_holder->SetPersistable(true);
+
+  int i = 0;
+  for (auto& fetch_name : fetch_names) {
+    // append fetch op
+    auto* op = block->AppendOp();
+    op->SetType("fetch_v2");
+    op->SetInput("X", {fetch_name});
+    op->SetOutput("Out", {kFetchVarName});
+    op->SetAttr("col", {static_cast<int>(i)});
+    op->CheckAttrs();
+    i++;
+  }
+}
+
 std::vector<size_t> merge_vector(const std::vector<size_t>& first,
                                  const std::vector<size_t>& second) {
   std::vector<size_t> out(first.size() + second.size());
@@ -483,6 +614,6 @@ std::vector<size_t> merge_vector(const std::vector<size_t>& first,
   return out;
 }
 
-}  // namespace interpretercore
+}  // namespace interpreter
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/new_executor/interpretercore_util.h b/paddle/fluid/framework/new_executor/interpretercore_util.h
index b1e1c02ab9513..375fed2356a01 100644
--- a/paddle/fluid/framework/new_executor/interpretercore_util.h
+++ b/paddle/fluid/framework/new_executor/interpretercore_util.h
@@ -48,9 +48,10 @@
 namespace paddle {
 namespace framework {
 
-namespace interpretercore {
+namespace interpreter {
 
 using AtomicVectorSizeT = std::vector<std::unique_ptr<std::atomic<size_t>>>;
+static constexpr char kFetchVarName[] = "fetch_vars";
 
 class AsyncWorkQueue {
  public:
@@ -96,18 +97,20 @@ class AsyncWorkQueue {
 std::string get_memcpy_type(const platform::Place& src_place,
                             const platform::Place& dst_place);
 
-void build_variable_scope(const framework::ProgramDesc& pdesc,
+void build_variable_scope(const framework::BlockDesc& block,
                           VariableScope* var_scope);
 
 void build_op_func_list(const platform::Place& place,
-                        const framework::ProgramDesc& pdesc,
-                        std::vector<OperatorBase*>* op_list,
+                        const framework::BlockDesc& block,
                         std::vector<OpFuncNode>* vec_func_list,
                         VariableScope* var_scope);
 
+void add_fetch(const std::vector<std::string>& fetch_names,
+               framework::BlockDesc* block);
+
 std::vector<size_t> merge_vector(const std::vector<size_t>& first,
                                  const std::vector<size_t>& second);
 
-}  // namespace interpretercore
+}  // namespace interpreter
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/new_executor/new_executor_defs.h b/paddle/fluid/framework/new_executor/new_executor_defs.h
index e6cff353a659d..37fb57072f5ec 100644
--- a/paddle/fluid/framework/new_executor/new_executor_defs.h
+++ b/paddle/fluid/framework/new_executor/new_executor_defs.h
@@ -19,6 +19,7 @@
 #include <vector>
 
 #include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/framework/variable_helper.h"
 #include "paddle/fluid/platform/device_event_base.h"
 #include "paddle/fluid/platform/event.h"
 
@@ -463,7 +464,6 @@ class InterpretercoreInferShapeContext : public InferShapeContext {
 
 struct OpKernelFunc {
   OpKernelComputeFunc compute_func_;
-  OperatorBase* operator_base_;
 };
 
 struct VariableMetaInfo {
@@ -471,13 +471,170 @@ struct VariableMetaInfo {
   paddle::framework::VarDesc* vardesc_;
 };
 
-struct VariableScope {
-  std::vector<Variable*> var_list;
-  std::map<std::string, int> name2id;
+// TODO(zhiqiu): Maybe we need to add rwlock for VariableScope?
+
+// NOTE(xiongkun03): Use scope as a member of VariableScope, we don't need
+// ScopeBase.
+//                   Scope manager the variables and VariableScope is just a
+//                   quick
+//                   access machanism.
+class VariableScope : public ScopeBase {
+ public:
+  VariableScope() {
+    // for @EMPTY@ variable
+    var_list_.push_back(nullptr);
+    name2id_[kEmptyVarName] = 0;
+    VariableMetaInfo info;
+    info.var_ref_count_ = 0;
+    info.vardesc_ = nullptr;
+    vec_meta_info_.push_back(info);
+    scope_ptr_.reset(new Scope());
+  }
+  const Scope* GetScope() const { return scope_ptr_.get(); }
+
+  Variable* FindVar(const std::string& name) const {
+    auto it = name2id_.find(name);
+    if (it != name2id_.end()) {
+      PADDLE_ENFORCE_LT(it->second, var_list_.size(),
+                        platform::errors::NotFound(
+                            "The id(%d) of variable(%s) should not be larger "
+                            "than the size of variable list(%d).",
+                            it->second, name, var_list_.size()));
+      return var_list_[it->second];
+    }
+    return nullptr;
+  }
+
+  // Get variable id by name, return -1 if not found
+  int GetIdByName(const std::string& name) const {
+    auto it = name2id_.find(name);
+    if (it != name2id_.end()) {
+      return it->second;
+    }
+    return -1;
+  }
+
+  // Get variable name by id, return "" if not found
+  std::string GetNameById(int id) const {
+    // NOTE(zhiqiu): do not use vec_meta_info_[id].vardesc_->Name() since
+    // vec_meta_info_[id] may be nullptr,
+    // typically when the target variable is not existed in the original program
+    // desc, but created by interpretercore.
+    // For example, created and used by d2h_copy or h2d_copy operator.
+    auto it =
+        std::find_if(name2id_.begin(), name2id_.end(),
+                     [id](const auto& pair) { return pair.second == id; });
+    if (it != name2id_.end()) {
+      return it->first;
+    }
+    return "";
+  }
+
+  bool HasVar(const std::string& name) const {
+    return name2id_.find(name) != name2id_.end();
+  }
+
+  int VarId(const std::string& name) const {
+    CheckExist(name);
+    return name2id_.at(name);
+  }
+
+  Variable* Var(int id) const { return var_list_.at(id); }
+
+  Variable* Var(const std::string& name) const {
+    return var_list_.at(VarId(name));
+  }
+
+  size_t VarSize() const { return var_list_.size(); }
+
+  void AddVar(const std::string& name, VarDesc* var_desc) {  // NOLINT
+    name2id_[name] = VarSize();
+    auto v = scope_ptr_->Var(name);
+    if (nullptr == var_desc) {
+      v->GetMutable<LoDTensor>();
+    } else {
+      InitializeVariable(
+          v,
+          var_desc
+              ->GetType());  // Scope don't initialize variable recently created
+    }
+    var_list_.push_back(v);
+
+    VariableMetaInfo info;
+    info.var_ref_count_ = 0;
+    info.vardesc_ = var_desc;
+    vec_meta_info_.push_back(info);
+  }
+
+  void AddVar(const std::string& name, Variable& var) {  // NOLINT
+    // must copy.
+    VLOG(4) << "Add variable: " << name << " through AddVar()";
+    auto v = scope_ptr_->Var(name);
+    *v = var;
+    name2id_[name] = VarSize();
+    var_list_.push_back(v);
+
+    VariableMetaInfo info;
+    info.var_ref_count_ = 0;
+    info.vardesc_ = nullptr;
+    vec_meta_info_.push_back(info);
+  }
+
+  void SetVarDesc(const std::string& name, framework::VarDesc* var_desc) {
+    CheckExist(name);
+    vec_meta_info_[VarId(name)].vardesc_ = var_desc;
+  }
+
+  paddle::framework::VarDesc* VarDesc(const std::string& name) const {
+    return VarDesc(VarId(name));
+  }
+
+  paddle::framework::VarDesc* VarDesc(int id) const {
+    CheckExist(id);
+    return vec_meta_info_[id].vardesc_;
+  }
+
+  void CheckExist(int id) const {
+    PADDLE_ENFORCE_LT(id, var_list_.size(),
+                      platform::errors::PreconditionNotMet(
+                          "Required var_id < %d, but received var_id = %d.",
+                          var_list_.size(), id));
+  }
+
+  void CheckExist(const std::string& name) const {
+    PADDLE_ENFORCE_EQ(
+        HasVar(name), true,
+        platform::errors::NotFound("%s not in VariableScope.", name));
+  }
+
+  std::vector<VariableMetaInfo>& MutableVecMetaInfo() { return vec_meta_info_; }
+
+  const std::vector<VariableMetaInfo>& VecMetaInfo() const {
+    return vec_meta_info_;
+  }
+
+ private:
+  std::vector<Variable*> var_list_;
+  std::map<std::string, int> name2id_;
   std::vector<VariableMetaInfo> vec_meta_info_;
+  std::unique_ptr<Scope> scope_ptr_;
 };
 
-struct NextInstruction {
+class NextInstruction {
+ public:
+  void AddDirectRun(size_t id) { direct_run_.push_back(id); }
+
+  void ADDEventRun(size_t id) { event_wait_run_.push_back(id); }
+
+  void AddSyncRun(size_t id) { synchronize_run_.push_back(id); }
+
+  const std::vector<size_t>& DirectRunIds() const { return direct_run_; }
+
+  const std::vector<size_t>& EventRunIds() const { return event_wait_run_; }
+
+  const std::vector<size_t>& SyncRunIds() const { return synchronize_run_; }
+
+ private:
   std::vector<size_t> direct_run_;
   std::vector<size_t> event_wait_run_;
   std::vector<size_t> synchronize_run_;
@@ -503,51 +660,140 @@ enum class OpFuncType {
 };
 class RuntimeInferShapeContext;
 
-struct Instruction {
-  OpKernelFunc kernel_func_;
+struct OpFuncNode {
+  OperatorBase* operator_base_;
+  std::map<std::string, std::vector<int>> input_index;
+  std::map<std::string, std::vector<int>> output_index;
+  std::unordered_set<int> no_data_transform_index;
+
+  OpKernelComputeFunc kernel_func_;
+  platform::DeviceContext* dev_ctx_;  // not owned
+  OpFuncType type_;
+};
+
+class Instruction {
+ public:
+  Instruction(size_t id, const OpFuncNode& op_func_node,
+              const platform::DeviceContext& dev_ctx)
+      : id_(id), op_func_node_(op_func_node), dev_ctx_(dev_ctx) {
+    PADDLE_ENFORCE_GE(id, 0, platform::errors::PreconditionNotMet(
+                                 "Required id >= 0, but received id = %d", id));
+  }
+
+  size_t Id() const { return id_; }
+
+  const std::map<std::string, std::vector<int>>& Inputs() const {
+    return op_func_node_.input_index;
+  }
+
+  const std::map<std::string, std::vector<int>>& Outputs() const {
+    return op_func_node_.output_index;
+  }
+
+  const std::unordered_set<int>& NoDataTransformVars() const {
+    return op_func_node_.no_data_transform_index;
+  }
+
+  OpKernelComputeFunc KernelFunc() const { return op_func_node_.kernel_func_; }
+
+  OpFuncType KernelType() const { return op_func_node_.type_; }
+
+  OperatorBase* OpBase() const {
+    auto* op_base = op_func_node_.operator_base_;
+    PADDLE_ENFORCE_NOT_NULL(op_base, platform::errors::PreconditionNotMet(
+                                         "op_base shall not be nullptr."));
+    return op_base;
+  }
+
+  NextInstruction& NextInstructions() { return next_instruction_; }
+
+  const NextInstruction& NextInstructions() const { return next_instruction_; }
+
+  void AddGCCheckVar(size_t id) { gc_check_var_list_.push_back(id); }
+
+  const std::vector<size_t>& GCCheckVars() const { return gc_check_var_list_; }
+
+  void ResetContext(const VariableValueMap& in_vars,
+                    const VariableValueMap& out_vars) {
+    runtime_ctx_.reset(new RuntimeContext(in_vars, out_vars));
+    infershape_ctx_.reset(
+        new InterpretercoreInferShapeContext(*OpBase(), *runtime_ctx_.get()));
+    // NOTE: Because execution_ctx_ is constructed by `scope&`, so we fake an
+    // empty here to avoid illegal local reference.
+    static framework::Scope scope_;
+    execution_ctx_.reset(
+        new ExecutionContext(*OpBase(), scope_, dev_ctx_, *runtime_ctx_.get()));
+  }
+
+  std::shared_ptr<RuntimeContext> InnerRuntimeContext() const {
+    return runtime_ctx_;
+  }
+
+  std::shared_ptr<InterpretercoreInferShapeContext> InnerInferShapeContext()
+      const {
+    return infershape_ctx_;
+  }
+
+  std::shared_ptr<ExecutionContext> InnerExecutionContext() const {
+    return execution_ctx_;
+  }
+
+  const platform::DeviceContext& DeviceContext() const { return dev_ctx_; }
+
+  const std::vector<std::pair<Variable*, Variable*>>& InplaceInfo() const {
+    return vec_inplace_in_to_out_;
+  }
+
+  void AddInplace(Variable* in, Variable* out) {
+    vec_inplace_in_to_out_.emplace_back(in, out);
+  }
+
+  const std::vector<EventInter>& InputEvents() const { return intput_events_; }
+
+  const std::vector<EventInter>& OutputEvents() const { return output_events_; }
+
+  void AddInputEvent(size_t var_id,
+                     std::shared_ptr<platform::DeviceEvent> event,
+                     platform::DeviceType waiter_type) {
+    intput_events_.emplace_back(var_id, event, waiter_type);
+  }
+
+  void AddOutputEvent(size_t var_id,
+                      std::shared_ptr<platform::DeviceEvent> event,
+                      platform::DeviceType waiter_type) {
+    output_events_.emplace_back(var_id, event, waiter_type);
+  }
+
+ private:
+  size_t id_;
+  const OpFuncNode& op_func_node_;          // not owned
+  const platform::DeviceContext& dev_ctx_;  // not owned
+
   std::shared_ptr<RuntimeContext> runtime_ctx_;
   std::shared_ptr<InterpretercoreInferShapeContext> infershape_ctx_;
   std::shared_ptr<ExecutionContext> execution_ctx_;
-  std::map<std::string, std::vector<int>> input_index_;
-  std::map<std::string, std::vector<int>> output_index_;
 
-  std::unordered_set<int> no_data_transform_index_;
-
-  std::vector<size_t> gc_check_var_list;
+  std::vector<size_t> gc_check_var_list_;
   NextInstruction next_instruction_;
 
   std::vector<EventInter> intput_events_;
   std::vector<EventInter> output_events_;
 
-  platform::DeviceContext* dev_ctx_;  // not owned
-  OpFuncType type_;
-
   std::vector<std::pair<Variable*, Variable*>> vec_inplace_in_to_out_;
 };
 
-struct OpFuncNode {
-  // int unsed;
-  std::map<std::string, std::vector<int>> input_index;
-  std::map<std::string, std::vector<int>> output_index;
-  std::unordered_set<int> no_data_transform_index;
-
-  OpKernelComputeFunc kernel_func_;
-  platform::DeviceContext* dev_ctx_;  // not owned
-  OpFuncType type_;
-};
-
-namespace interpretercore {
+namespace interpreter {
 static constexpr char kMemcpyH2D[] = "memcpy_h2d";
 static constexpr char kMemcpyD2H[] = "memcpy_d2h";
 
 static bool IsMemcpyH2D(const Instruction& instr) {
-  return instr.kernel_func_.operator_base_->Type() == kMemcpyH2D;
+  return instr.OpBase()->Type() == kMemcpyH2D;
 }
 
 static bool IsMemcpyD2H(const Instruction& instr) {
-  return instr.kernel_func_.operator_base_->Type() == kMemcpyD2H;
+  return instr.OpBase()->Type() == kMemcpyD2H;
 }
-}  // namespace interpretercore
+}  // namespace interpreter
 
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/new_executor/nonblocking_threadpool.h b/paddle/fluid/framework/new_executor/nonblocking_threadpool.h
index 6e56532456c6f..cdcdbbb445185 100644
--- a/paddle/fluid/framework/new_executor/nonblocking_threadpool.h
+++ b/paddle/fluid/framework/new_executor/nonblocking_threadpool.h
@@ -394,16 +394,16 @@ class ThreadPoolTempl {
     // We already did best-effort emptiness check in Steal, so prepare for
     // blocking.
     ec_.Prewait();
+    if (cancelled_) {
+      ec_.CancelWait();
+      return false;
+    }
     // Now do a reliable emptiness check.
     int victim = NonEmptyQueueIndex();
     if (victim != -1) {
       ec_.CancelWait();
-      if (cancelled_) {
-        return false;
-      } else {
-        *t = thread_data_[victim].queue.PopBack();
-        return true;
-      }
+      *t = thread_data_[victim].queue.PopBack();
+      return true;
     }
     // Number of blocked threads is used as termination condition.
     // If we are shutting down and all worker threads blocked without work,
diff --git a/paddle/fluid/framework/new_executor/profiler.h b/paddle/fluid/framework/new_executor/profiler.h
index 77783535b6471..51c9e3d66a6f0 100644
--- a/paddle/fluid/framework/new_executor/profiler.h
+++ b/paddle/fluid/framework/new_executor/profiler.h
@@ -20,84 +20,41 @@
 
 namespace paddle {
 namespace framework {
-
-static void GetTensors(Variable* var, std::unordered_set<Tensor*>* tensor_set) {
-  if (var->IsType<LoDTensor>() && var->Get<LoDTensor>().IsInitialized()) {
-    tensor_set->insert(var->GetMutable<LoDTensor>());
-  } else if (var->IsType<SelectedRows>() &&
-             var->Get<SelectedRows>().value().IsInitialized()) {
-    tensor_set->insert(var->GetMutable<SelectedRows>()->mutable_value());
-  } else if (var->IsType<LoDTensorArray>()) {
-    auto* tensor_arr = var->GetMutable<LoDTensorArray>();
-    for (auto& t : *tensor_arr) {
-      if (t.IsInitialized()) {
-        tensor_set->insert(&t);
-      }
-    }
-  }
-}
-
-static std::pair<size_t, size_t> GetTensorMemorySize(
-    const std::vector<Variable*>& var_list) {
-  std::unordered_set<Tensor*> tensor_set;
-  for (auto* var : var_list) {
-    GetTensors(var, &tensor_set);
-  }
-  size_t host_memory_bytes = 0;
-  size_t device_memory_bytes = 0;
-  std::unordered_set<memory::Allocation*> allocation_set;
-  for (auto* tensor : tensor_set) {
-    auto allocation = tensor->Holder().get();
-    if (!allocation_set.count(allocation)) {
-      allocation_set.insert(allocation);
-      if (platform::is_cuda_pinned_place(tensor->place()) ||
-          platform::is_cpu_place(tensor->place())) {
-        VLOG(3) << "found host memory : " << allocation->size();
-        host_memory_bytes += allocation->size();
-      } else {
-        VLOG(3) << "found device memory : " << allocation->size();
-        device_memory_bytes += allocation->size();
-      }
-    }
-  }
-  return {host_memory_bytes, device_memory_bytes};
-}
-
+namespace interpreter {
 struct CostInfo {
   double total_time{0.};          // ms
   size_t device_memory_bytes{0};  // total allocated memory size
 };
 
-class InterpreterProfiler {
+class ProfilerGuard {
  public:
-  void Start() { timer_.Start(); }
-
-  void Pause() {
-    timer_.Pause();
-    cost_info_.total_time += timer_.ElapsedMS();
+  ProfilerGuard(const platform::Place& place, CostInfo* cost_info)
+      : place_(place), cost_info_(cost_info) {
+    timer_.Start();
   }
 
-  void Reset() {
-    timer_.Reset();
-    cost_info_.total_time = 0.;
-    cost_info_.device_memory_bytes = 0;
+  ~ProfilerGuard() {
+    timer_.Pause();
+    cost_info_->total_time += timer_.ElapsedMS();
+    TotalCUDAAllocatedMemorySize(place_);
   }
 
+ private:
   void TotalCUDAAllocatedMemorySize(const platform::Place& place) {
     if (platform::is_gpu_place(place)) {
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
       auto cuda_place = BOOST_GET_CONST(platform::CUDAPlace, place);
-      cost_info_.device_memory_bytes =
+      cost_info_->device_memory_bytes =
           platform::RecordedCudaMallocSize(cuda_place.device);
 #endif
     }
   }
 
-  const CostInfo& GetCostInfo() const { return cost_info_; }
-
- private:
+  const platform::Place& place_;
+  CostInfo* cost_info_;
   platform::Timer timer_;
-  CostInfo cost_info_;
 };
+
+}  // namespace interpreter
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/new_executor/standalone_executor.cc b/paddle/fluid/framework/new_executor/standalone_executor.cc
index a7579d54616af..e3bcbaec7ad70 100644
--- a/paddle/fluid/framework/new_executor/standalone_executor.cc
+++ b/paddle/fluid/framework/new_executor/standalone_executor.cc
@@ -33,41 +33,33 @@ StandaloneExecutor::StandaloneExecutor(const platform::Place& place,
     auto name_list = outer_scope_->LocalVarNames();
     for (auto name : name_list) {
       auto v = outer_scope_->Var(name);
-      if (global_scope_.name2id.find(name) == global_scope_.name2id.end()) {
-        global_scope_.name2id[name] = global_scope_.var_list.size();
-        global_scope_.var_list.push_back(v);
-
-        VariableMetaInfo info;
-        info.var_ref_count_ = 0;
-        info.vardesc_ = nullptr;
-        global_scope_.vec_meta_info_.push_back(info);
+      if (!global_scope_.HasVar(name)) {
+        global_scope_.AddVar(name, *v);
       }
     }
   }
 
   // run startup program
   std::vector<paddle::framework::OpFuncNode> vec_func_list;
-  std::vector<paddle::framework::OperatorBase*> op_list;
-  paddle::framework::interpretercore::build_op_func_list(
-      place_, startup_prog, &op_list, &vec_func_list, &global_scope_);
+  paddle::framework::interpreter::build_op_func_list(
+      place_, startup_prog.Block(0), &vec_func_list, &global_scope_);
 }
 
 paddle::framework::FetchList StandaloneExecutor::Run(
     const std::vector<std::string>& feed_names,
-    const std::vector<framework::Tensor>& feed_tensors,
+    const std::vector<framework::LoDTensor>& feed_tensors,
     const std::vector<std::string>& fetch_names) {
   auto core = GetInterpreterCore(feed_names, fetch_names);
 
-  return core->Run(feed_tensors);
+  return core->Run(feed_names, feed_tensors);
 }
 
-const CostInfo& StandaloneExecutor::DryRun(
+framework::interpreter::CostInfo StandaloneExecutor::DryRun(
     const std::vector<std::string>& feed_names,
-    const std::vector<framework::Tensor>& feed_tensors) {
+    const std::vector<framework::LoDTensor>& feed_tensors) {
   auto core = GetInterpreterCore(feed_names, {});
 
-  auto& cost_info = core->DryRun(feed_tensors);
-  return cost_info;
+  return core->DryRun(feed_names, feed_tensors);
 }
 
 void StandaloneExecutor::BuildVariableOuterScope(
@@ -80,16 +72,8 @@ void StandaloneExecutor::BuildVariableOuterScope(
       continue;
     }
 
-    if (var_scope->name2id.find(var->Name()) == var_scope->name2id.end()) {
-      var_scope->name2id[var->Name()] = var_scope->var_list.size();
-      auto v = outer_scope->Var(var->Name());
-      InitializeVariable(v, var->GetType());
-      var_scope->var_list.push_back(v);
-
-      VariableMetaInfo info;
-      info.var_ref_count_ = 0;
-      info.vardesc_ = var;
-      var_scope->vec_meta_info_.push_back(info);
+    if (!var_scope->HasVar(var->Name())) {
+      var_scope->AddVar(var->Name(), var);
     }
   }
 }
@@ -111,8 +95,15 @@ std::shared_ptr<InterpreterCore> StandaloneExecutor::GetInterpreterCore(
 
   if (iter == interpretercores_.end()) {
     VLOG(3) << "create interpreter_core for " << oss.str();
-    auto core = std::make_shared<InterpreterCore>(
-        place_, main_prog_, &global_scope_, feed_names, fetch_names);
+    // NOTE(Aurelius84): `add_fetch` will modify BlockDesc, so we should copy a
+    // new program.
+    auto new_prog = std::make_shared<framework::ProgramDesc>(main_prog_);
+    auto* block = new_prog->MutableBlock(0);
+    interpreter::add_fetch(fetch_names, block);
+
+    auto core =
+        std::make_shared<InterpreterCore>(place_, *block, &global_scope_);
+    programs_.emplace(oss.str(), new_prog);
     interpretercores_.emplace(oss.str(), core);
     return core;
   } else {
diff --git a/paddle/fluid/framework/new_executor/standalone_executor.h b/paddle/fluid/framework/new_executor/standalone_executor.h
index 600c90e3a11a6..1fbdf7b4b0ad6 100644
--- a/paddle/fluid/framework/new_executor/standalone_executor.h
+++ b/paddle/fluid/framework/new_executor/standalone_executor.h
@@ -28,7 +28,7 @@ class ExecutorBase {
   virtual ~ExecutorBase() {}
   virtual paddle::framework::FetchList Run(
       const std::vector<std::string>& feed_names,
-      const std::vector<framework::Tensor>& feed_tensors,
+      const std::vector<framework::LoDTensor>& feed_tensors,
       const std::vector<std::string>& fetch_names) = 0;
 };
 
@@ -42,11 +42,12 @@ class StandaloneExecutor : public ExecutorBase {
 
   virtual paddle::framework::FetchList Run(
       const std::vector<std::string>& feed_names,
-      const std::vector<framework::Tensor>& feed_tensors,
+      const std::vector<framework::LoDTensor>& feed_tensors,
       const std::vector<std::string>& fetch_names);
 
-  const CostInfo& DryRun(const std::vector<std::string>& feed_names,
-                         const std::vector<framework::Tensor>& feed_tensors);
+  framework::interpreter::CostInfo DryRun(
+      const std::vector<std::string>& feed_names,
+      const std::vector<framework::LoDTensor>& feed_tensors);
 
  private:
   void BuildVariableOuterScope(const framework::ProgramDesc& pdesc,
@@ -62,6 +63,7 @@ class StandaloneExecutor : public ExecutorBase {
   Scope* outer_scope_;
   VariableScope global_scope_;
 
+  std::unordered_map<std::string, std::shared_ptr<ProgramDesc>> programs_;
   std::unordered_map<std::string, std::shared_ptr<InterpreterCore>>
       interpretercores_;
 };
diff --git a/paddle/fluid/framework/new_executor/stream_analyzer.cc b/paddle/fluid/framework/new_executor/stream_analyzer.cc
index ffc2da499e1f7..23b61dd3d5ee7 100644
--- a/paddle/fluid/framework/new_executor/stream_analyzer.cc
+++ b/paddle/fluid/framework/new_executor/stream_analyzer.cc
@@ -31,15 +31,15 @@ namespace framework {
 std::vector<size_t> StreamAnalyzer::ParseEventVarIds(
     const Instruction& cur_instr, const Instruction& next_instr) {
   std::unordered_set<size_t> unique_var_ids;
-  for (auto& item : cur_instr.output_index_) {
+  for (auto& item : cur_instr.Outputs()) {
     unique_var_ids.insert(item.second.begin(), item.second.end());
   }
 
   std::vector<size_t> new_event_var_ids;
-  for (auto& item : next_instr.input_index_) {
+  for (auto& item : next_instr.Inputs()) {
     for (auto var_id : item.second) {
       if (unique_var_ids.count(var_id) > 0 &&
-          next_instr.no_data_transform_index_.count(var_id) == 0) {
+          next_instr.NoDataTransformVars().count(var_id) == 0) {
         new_event_var_ids.push_back(var_id);
       }
     }
@@ -57,8 +57,7 @@ void StreamAnalyzer::AssociateInputWithEvents(
       var_id2event_.emplace(var_id, std::move(device_event));
     }
     // Add events for next_instr.inputs
-    next_instr->intput_events_.emplace_back(var_id, var_id2event_.at(var_id),
-                                            waiter_type);
+    next_instr->AddInputEvent(var_id, var_id2event_.at(var_id), waiter_type);
   }
 }
 
@@ -66,13 +65,13 @@ void StreamAnalyzer::Schedule(const std::vector<size_t>& downstream_ops,
                               std::vector<Instruction>* instructions,
                               size_t op_index) {
   auto& cur_instr = instructions->at(op_index);
-  auto& next_instruction = cur_instr.next_instruction_;
+  auto& next_instruction = cur_instr.NextInstructions();
   std::vector<size_t> event_var_ids;
   for (auto next_op_id : downstream_ops) {
     auto& next_instr = instructions->at(next_op_id);
 
     if (IsDirectRun(cur_instr, next_instr)) {
-      next_instruction.direct_run_.emplace_back(next_op_id);
+      next_instruction.AddDirectRun(next_op_id);
     } else {
       // Always insert events between different stream
       auto new_event_var_ids = ParseEventVarIds(cur_instr, next_instr);
@@ -83,29 +82,29 @@ void StreamAnalyzer::Schedule(const std::vector<size_t>& downstream_ops,
       AssociateInputWithEvents(new_event_var_ids, &next_instr, waiter_type);
 
       if (waiter_type == platform::kCPU) {  // GPU -> CPU
-        next_instruction.synchronize_run_.emplace_back(next_op_id);
+        next_instruction.AddSyncRun(next_op_id);
       } else {  // GPU -> GPU(different stream)
-        next_instruction.event_wait_run_.emplace_back(next_op_id);
+        next_instruction.ADDEventRun(next_op_id);
       }
     }
   }
   // Create events for these cross-stream vars
-  VLOG(3) << cur_instr.kernel_func_.operator_base_->Type()
+  VLOG(3) << cur_instr.OpBase()->Type()
           << " event_var_ids.size: " << event_var_ids.size();
   for (auto var_id : event_var_ids) {
-    cur_instr.output_events_.emplace_back(var_id, var_id2event_.at(var_id),
-                                          platform::kCUDA /*not used*/);
+    cur_instr.AddOutputEvent(var_id, var_id2event_.at(var_id),
+                             platform::kCUDA /*not used*/);
   }
 }
 
 platform::DeviceContext* StreamAnalyzer::ParseDeviceContext(
-    const OpFuncNode& op_func_node, const OperatorBase& op_base) {
-  auto& op_type = op_base.Type();
+    const OpFuncNode& op_func_node) {
+  auto& op_type = op_func_node.operator_base_->Type();
   auto* dev_ctx = op_func_node.dev_ctx_;
-  if (op_type == interpretercore::kMemcpyH2D) {
+  if (op_type == interpreter::kMemcpyH2D) {
     VLOG(3) << "Get dev_ctx from d2h_context_pool_";
     dev_ctx = d2h_ctx_pool_.Get(place_);
-  } else if (op_type == interpretercore::kMemcpyD2H) {
+  } else if (op_type == interpreter::kMemcpyD2H) {
     VLOG(3) << "Get dev_ctx from h2d_context_pool_";
     dev_ctx = h2d_ctx_pool_.Get(place_);
   }
@@ -122,13 +121,13 @@ platform::DeviceContext* StreamAnalyzer::ParseDeviceContext(
  */
 bool StreamAnalyzer::IsDirectRun(Instruction& cur_instr,
                                  const Instruction& next_instr) {
-  return (cur_instr.dev_ctx_ == next_instr.dev_ctx_ ||
-          interpretercore::IsMemcpyD2H(cur_instr) ||
-          interpretercore::IsMemcpyH2D(next_instr));
+  return (&cur_instr.DeviceContext() == &next_instr.DeviceContext() ||
+          interpreter::IsMemcpyD2H(cur_instr) ||
+          interpreter::IsMemcpyH2D(next_instr));
 }
 
 platform::DeviceType StreamAnalyzer::GetWaiterType(const Instruction& instr) {
-  if (instr.type_ == OpFuncType::kQueueSync) {
+  if (instr.KernelType() == OpFuncType::kQueueSync) {
     return platform::kCPU;
   } else {
     return platform::kCUDA;
diff --git a/paddle/fluid/framework/new_executor/stream_analyzer.h b/paddle/fluid/framework/new_executor/stream_analyzer.h
index dc2af389e36b0..df74c9b933712 100644
--- a/paddle/fluid/framework/new_executor/stream_analyzer.h
+++ b/paddle/fluid/framework/new_executor/stream_analyzer.h
@@ -32,8 +32,7 @@ class StreamAnalyzer {
   void Schedule(const std::vector<size_t>& downstream_ops,
                 std::vector<Instruction>* instructions, size_t op_index);
 
-  platform::DeviceContext* ParseDeviceContext(const OpFuncNode& op_func_node,
-                                              const OperatorBase& op_base);
+  platform::DeviceContext* ParseDeviceContext(const OpFuncNode& op_func_node);
 
  private:
   std::vector<size_t> ParseEventVarIds(const Instruction& cur_instr,
diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index 0cd17cdb10d55..e0a80d3c79854 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -29,6 +29,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/var_type.h"
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/profiler.h"
+#include "paddle/pten/common/scalar.h"
 
 namespace paddle {
 namespace framework {
@@ -49,6 +50,7 @@ DECLARE_bool(check_nan_inf);
 DECLARE_bool(enable_unused_var_check);
 PADDLE_DEFINE_EXPORTED_int32(inner_op_parallelism, 0,
                              "number of threads for inner op");
+DECLARE_bool(run_pten_kernel);
 
 namespace paddle {
 namespace framework {
@@ -60,7 +62,7 @@ std::vector<std::tuple<platform::Place, LibraryType>> kKernelPriority = {
     std::make_tuple(platform::CPUPlace(), LibraryType::kPlain),
 };
 
-static DDim GetDimsDebug(const Scope& scope, const std::string& name,
+static DDim GetDimsDebug(const ScopeBase& scope, const std::string& name,
                          bool get_actual_dim = false) {
   Variable* var = scope.FindVar(name);
   if (var == nullptr) {
@@ -83,13 +85,13 @@ static DDim GetDimsDebug(const Scope& scope, const std::string& name,
   }
 }
 
-static bool VarInited(const Scope& scope, const std::string& name) {
+static bool VarInited(const ScopeBase& scope, const std::string& name) {
   Variable* var = scope.FindVar(name);
   if (var == nullptr) return false;
   return var->IsInitialized();
 }
 
-static std::string GetDtype(const Scope& scope, const std::string& name) {
+static std::string GetDtype(const ScopeBase& scope, const std::string& name) {
   Variable* var = scope.FindVar(name);
   if (var == nullptr) {
     return "";
@@ -115,7 +117,7 @@ static std::string GetDtype(const Scope& scope, const std::string& name) {
   }
 }
 
-static std::string GetPlace(const Scope& scope, const std::string& name) {
+static std::string GetPlace(const ScopeBase& scope, const std::string& name) {
   Variable* var = scope.FindVar(name);
   if (var == nullptr) {
     return "";
@@ -144,7 +146,7 @@ static std::string GetPlace(const Scope& scope, const std::string& name) {
   }
 }
 
-static int GetRowSize(const Scope& scope, const std::string& name) {
+static int GetRowSize(const ScopeBase& scope, const std::string& name) {
   Variable* var = scope.FindVar(name);
   if (var == nullptr) {
     return -1;
@@ -157,7 +159,7 @@ static int GetRowSize(const Scope& scope, const std::string& name) {
   return -1;
 }
 
-static LoD GetLoDDebug(const Scope& scope, const std::string& name) {
+static LoD GetLoDDebug(const ScopeBase& scope, const std::string& name) {
   Variable* var = scope.FindVar(name);
   auto default_lod = LoD({{}});
 
@@ -306,7 +308,7 @@ const std::vector<std::string>& OperatorBase::Outputs(
   return it->second;
 }
 
-std::string OperatorBase::DebugStringEx(const Scope* scope) const {
+std::string OperatorBase::DebugStringEx(const ScopeBase* scope) const {
   std::stringstream ss;
   ss << "Op(" << type_ << "), inputs:{";
 
@@ -1120,8 +1122,24 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
   }
 #endif
 
-  if (kernel_type_.get() == nullptr || kernel_func_.get() == nullptr) {
-    ChooseKernel(*runtime_ctx, scope, place);
+  auto exe_ctx = ExecutionContext(*this, scope, *dev_ctx, *runtime_ctx);
+
+  // TODO(chenweihang): Now we are still reusing a lot of the original fluid
+  // implementation, this is a gradual replacement process
+  // TODO(chenweihang): in the first phase of project, we only support CPU, CUDA
+  // and RCOM backend, the XPU, NPU and MKLDNN will be supported in the second
+  // phase
+  if (FLAGS_run_pten_kernel &&
+      pten::KernelFactory::Instance().HasCompatiblePtenKernel(type_)) {
+    if (pt_kernel_signature_ == nullptr || pt_kernel_ == nullptr) {
+      ChoosePtenKernel(exe_ctx);
+    }
+    run_pten_kernel_ = pt_kernel_->IsValid();
+  }
+  if (!run_pten_kernel_) {
+    if (kernel_type_.get() == nullptr || kernel_func_.get() == nullptr) {
+      ChooseKernel(exe_ctx);
+    }
   }
 
   // do data transformScope &transfer_scope;
@@ -1159,8 +1177,17 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
   {
     platform::RecordEvent record_event("compute",
                                        platform::EventRole::kInnerOp);
-    (*kernel_func_)(
-        ExecutionContext(*this, exec_scope, *dev_ctx, *runtime_ctx));
+    if (run_pten_kernel_) {
+      if (pt_kernel_context_ == nullptr) {
+        pt_kernel_context_.reset(new pten::KernelContext());
+      }
+      BuildPtenKernelContext(*runtime_ctx, dev_ctx);
+      (*pt_kernel_)(pt_kernel_context_.get());
+      pt_kernel_context_->ClearData();
+    } else {
+      (*kernel_func_)(
+          ExecutionContext(*this, exec_scope, *dev_ctx, *runtime_ctx));
+    }
   }
 
   if (!transfered_inplace_vars.empty()) {
@@ -1208,25 +1235,11 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
   }
 }
 
-void OperatorWithKernel::ChooseKernel(const RuntimeContext& ctx,
-                                      const Scope& scope,
-                                      const platform::Place& place) const {
-  platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
-  auto* dev_ctx = pool.Get(place);
-
-  // check if op[type] has kernel registered.
-  auto& all_op_kernels = AllOpKernels();
-  auto kernels_iter = all_op_kernels.find(type_);
-  PADDLE_ENFORCE_NE(
-      kernels_iter, all_op_kernels.end(),
-      platform::errors::Unavailable(
-          "There are no kernels which are registered in the %s operator.",
-          type_));
-
-  OpKernelMap& kernels = kernels_iter->second;
+OpKernelType OperatorWithKernel::InnerGetExpectedKernelType(
+    const ExecutionContext& ctx) const {
+  auto& dev_ctx = ctx.device_context();
 
-  auto expected_kernel_key = this->GetExpectedKernelType(
-      ExecutionContext(*this, scope, *dev_ctx, ctx));
+  auto expected_kernel_key = this->GetExpectedKernelType(ctx);
   if (HasAttr("op_device")) {
     if (Attr<std::string>("op_device") == "cpu") {
       expected_kernel_key.place_ = platform::CPUPlace();
@@ -1243,9 +1256,9 @@ void OperatorWithKernel::ChooseKernel(const RuntimeContext& ctx,
       // when the Op that only has CPUKernel is assigned to GPU, the CPUKernel
       // will be executed and a warning will be given at the same time.
       if (SupportGPU()) {
-        expected_kernel_key.place_ = dev_ctx->GetPlace();
+        expected_kernel_key.place_ = dev_ctx.GetPlace();
       } else if (SupportNPU()) {
-        expected_kernel_key.place_ = dev_ctx->GetPlace();
+        expected_kernel_key.place_ = dev_ctx.GetPlace();
       } else {
         expected_kernel_key.place_ = platform::CPUPlace();
         LOG_FIRST_N(WARNING, 1)
@@ -1256,6 +1269,47 @@ void OperatorWithKernel::ChooseKernel(const RuntimeContext& ctx,
   }
   VLOG(3) << "op type:" << type_
           << ", expected_kernel_key:" << expected_kernel_key;
+  return expected_kernel_key;
+}
+
+void OperatorWithKernel::ChoosePtenKernel(const ExecutionContext& ctx) const {
+  pt_kernel_signature_.reset(
+      new KernelSignature(std::move(this->GetExpectedPtenKernelArgs(ctx))));
+
+  VLOG(1) << KernelSignatureToString(*pt_kernel_signature_.get());
+
+  kernel_type_.reset(
+      new OpKernelType(std::move(InnerGetExpectedKernelType(ctx))));
+
+  auto pt_kernel_name = pten::KernelName(pt_kernel_signature_->name);
+  auto pt_kernel_key = TransOpKernelTypeToPtenKernelKey(*kernel_type_.get());
+  pt_kernel_.reset(
+      new pten::Kernel(pten::KernelFactory::Instance().SelectKernel(
+          pt_kernel_name, pt_kernel_key)));
+
+  if (pt_kernel_->IsValid()) {
+    VLOG(1) << "Static mode ChoosePtenKernel - kernel name: " << pt_kernel_name
+            << " | kernel key: " << pt_kernel_key
+            << " | kernel: " << *pt_kernel_;
+  } else {
+    VLOG(1) << "Static mode ChoosePtenKernel - kernel `" << pt_kernel_name
+            << "` not found.";
+  }
+}
+
+void OperatorWithKernel::ChooseKernel(const ExecutionContext& ctx) const {
+  // check if op[type] has kernel registered.
+  auto& all_op_kernels = AllOpKernels();
+  auto kernels_iter = all_op_kernels.find(type_);
+  PADDLE_ENFORCE_NE(
+      kernels_iter, all_op_kernels.end(),
+      platform::errors::Unavailable(
+          "There are no kernels which are registered in the %s operator.",
+          type_));
+
+  OpKernelMap& kernels = kernels_iter->second;
+
+  auto expected_kernel_key = InnerGetExpectedKernelType(ctx);
 
   auto kernel_iter = kernels.find(expected_kernel_key);
 #ifdef PADDLE_WITH_MKLDNN
@@ -1562,11 +1616,10 @@ Scope* OperatorWithKernel::PrepareData(
 }
 
 void OperatorWithKernel::ParseInputDataType(
-    const ExecutionContext& ctx, const std::string& name,
+    const std::vector<Variable*>& vars, const std::string& name,
     proto::VarType::Type* data_type) const {
   proto::VarType::Type default_data_type =
       static_cast<proto::VarType::Type>(-1);
-  const std::vector<Variable*> vars = ctx.MultiInputVar(name);
   for (size_t i = 0; i < vars.size(); ++i) {
     const Variable* var = vars[i];
     if (var != nullptr) {
@@ -1588,10 +1641,9 @@ void OperatorWithKernel::ParseInputDataType(
       if (t != nullptr) {
         PADDLE_ENFORCE_EQ(
             t->IsInitialized(), true,
-            platform::errors::InvalidArgument(
-                "The Tensor in the %s Op's Input Variable %s(%s) is "
-                "not initialized.",
-                Type(), name, ctx.InputNames(name).at(i)));
+            platform::errors::InvalidArgument("The %s Op's Input Variable `%s` "
+                                              "contains uninitialized Tensor.",
+                                              Type(), name));
         proto::VarType::Type tmp = t->type();
         PADDLE_ENFORCE(tmp == *data_type || *data_type == default_data_type,
                        platform::errors::InvalidArgument(
@@ -1614,7 +1666,8 @@ proto::VarType::Type OperatorWithKernel::IndicateDataType(
       static_cast<proto::VarType::Type>(-1);
   proto::VarType::Type data_type = dafault_data_type;
   for (auto& input : ctx.InNameList()) {
-    ParseInputDataType(ctx, input, &data_type);
+    const std::vector<Variable*> vars = ctx.MultiInputVar(input);
+    ParseInputDataType(vars, input, &data_type);
   }
   PADDLE_ENFORCE_NE(
       data_type, dafault_data_type,
@@ -1628,7 +1681,7 @@ proto::VarType::Type OperatorWithKernel::IndicateVarDataType(
   proto::VarType::Type dafault_data_type =
       static_cast<proto::VarType::Type>(-1);
   proto::VarType::Type data_type = dafault_data_type;
-  ParseInputDataType(ctx, name, &data_type);
+  ParseInputDataType(ctx.MultiInputVar(name), name, &data_type);
   PADDLE_ENFORCE_NE(
       data_type, dafault_data_type,
       platform::errors::InvalidArgument(
@@ -1711,5 +1764,134 @@ OpKernelType OperatorWithKernel::GetKernelTypeForVar(
                       tensor.layout());
 }
 
+KernelSignature OperatorWithKernel::GetExpectedPtenKernelArgs(
+    const ExecutionContext& ctx) const {
+  return KernelSignatureMap::Instance().Get(Type());
+}
+
+void OperatorWithKernel::BuildPtenKernelContext(
+    const RuntimeContext& ctx, platform::DeviceContext* dev_ctx) const {
+  // TODO(chenweihang): now only work for very simple case,
+  // many cases need to be deal with later:
+  // 1. the input and output are not tensor
+  // 2. the dispensbale, duplicable input and output
+  // 3. needless attributes remove
+  // 4. use pt Tensor directly
+  // 5. kernel input is not DenseTensor
+  pt_kernel_context_->SetDeviceContext(dev_ctx);
+
+  auto& input_names = std::get<0>(pt_kernel_signature_->args);
+  auto& attr_names = std::get<1>(pt_kernel_signature_->args);
+  auto& output_names = std::get<2>(pt_kernel_signature_->args);
+
+  auto input_defs = pt_kernel_->args_def().input_defs();
+  auto attr_defs = pt_kernel_->args_def().attribute_defs();
+  auto output_defs = pt_kernel_->args_def().output_defs();
+
+  PADDLE_ENFORCE_EQ(input_names.size(), input_defs.size(),
+                    platform::errors::InvalidArgument(
+                        "The size of inputs_args names (%d) must be equal to "
+                        "the size of kernel input_defs (%d).",
+                        input_names.size(), input_defs.size()));
+
+  PADDLE_ENFORCE_EQ(output_names.size(), output_defs.size(),
+                    platform::errors::InvalidArgument(
+                        "The size of outputs_args names (%d) must be equal to "
+                        "the size of kernel output_defs (%d).",
+                        output_names.size(), output_defs.size()));
+
+  PADDLE_ENFORCE_EQ(attr_names.size(), attr_defs.size(),
+                    platform::errors::InvalidArgument(
+                        "The size of attribute_args names (%d) must be equal "
+                        "to the size of kernel attribute_defs (%d).",
+                        attr_names.size(), attr_defs.size()));
+
+  for (size_t i = 0; i < input_names.size(); ++i) {
+    auto& in_def = input_defs.at(i);
+    auto& ins_vector = ctx.inputs.at(input_names[i]);
+    if (pt_kernel_context_->InputsSize() <= i) {
+      paddle::SmallVector<std::shared_ptr<pten::TensorBase>> tmp_inputs;
+      for (auto* var : ins_vector) {
+        tmp_inputs.emplace_back(
+            experimental::MakePtenTensorBaseFromVar(*var, in_def));
+      }
+      pt_kernel_context_->EmplaceBackInputs(std::move(tmp_inputs));
+    } else {
+      size_t input_size = pt_kernel_context_->InputsSize();
+      for (size_t j = 0; j < ins_vector.size(); ++j) {
+        if (input_size > i + j) {
+          experimental::ReMakePtenDenseTensorFromVar(
+              *ins_vector[j], in_def,
+              pt_kernel_context_->MutableInputAt<pten::DenseTensor>(i + j));
+        }
+        // TODO(chenweihang): adapt multi-input case later
+      }
+      pt_kernel_context_->MutableInputRangeAt(i) =
+          std::make_pair(i, i + ins_vector.size());
+    }
+  }
+
+  for (size_t i = 0; i < output_names.size(); ++i) {
+    auto& out_def = output_defs.at(i);
+    auto& outs_vector = ctx.outputs.at(output_names[i]);
+    if (pt_kernel_context_->OutputsSize() <= i) {
+      paddle::SmallVector<std::shared_ptr<pten::TensorBase>> tmp_outputs;
+      for (auto* var : outs_vector) {
+        tmp_outputs.emplace_back(
+            experimental::MakePtenTensorBaseFromVar(var, out_def));
+      }
+      pt_kernel_context_->EmplaceBackOutputs(std::move(tmp_outputs));
+    } else {
+      size_t output_size = pt_kernel_context_->OutputsSize();
+      for (size_t j = 0; j < outs_vector.size(); ++j) {
+        if (output_size > i + j) {
+          experimental::ReMakePtenDenseTensorFromVar(
+              outs_vector[j], out_def,
+              pt_kernel_context_->MutableOutputAt<pten::DenseTensor>(i + j));
+        }
+        // TODO(chenweihang): adapt multi-output case later
+      }
+      pt_kernel_context_->MutableOutputRangeAt(i) =
+          std::make_pair(i, i + outs_vector.size());
+    }
+  }
+
+  for (size_t i = 0; i < attr_names.size(); ++i) {
+    auto& attr = Attrs().at(attr_names[i]);
+    if (attr_defs[i].type_index == std::type_index(typeid(pten::Scalar))) {
+      // TODO(chenweihang): support other attrs later
+      // TODO(zhangyunfei): Scalar should hold scaler type, and we should check
+      // attribtue type by attr_defs
+      if (std::type_index(attr.type()) == std::type_index(typeid(float))) {
+        pt_kernel_context_->EmplaceBackAttr(
+            std::move(pten::Scalar(BOOST_GET_CONST(float, attr))));
+      } else if (std::type_index(attr.type()) ==
+                 std::type_index(typeid(std::string))) {
+        pt_kernel_context_->EmplaceBackAttr(
+            std::move(pten::Scalar(BOOST_GET_CONST(std::string, attr))));
+      } else {
+        PADDLE_THROW(platform::errors::Unimplemented(
+            "unsupported cast op attribute `%s` to Scalar when construct "
+            "KernelContext.",
+            attr_names[i]));
+      }
+    } else {
+      // TODO(chenweihang): support other attrs later
+      if (attr_defs[i].type_index == std::type_index(typeid(int))) {
+        pt_kernel_context_->EmplaceBackAttr(BOOST_GET_CONST(int, attr));
+      } else if (attr_defs[i].type_index == std::type_index(typeid(float))) {
+        pt_kernel_context_->EmplaceBackAttr(BOOST_GET_CONST(float, attr));
+      } else if (attr_defs[i].type_index == std::type_index(typeid(bool))) {
+        pt_kernel_context_->EmplaceBackAttr(BOOST_GET_CONST(bool, attr));
+      } else {
+        PADDLE_THROW(platform::errors::Unimplemented(
+            "unsupported cast op attribute `%s` when construct "
+            "KernelContext.",
+            attr_names[i]));
+      }
+    }
+  }
+}
+
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h
index d703a09c476f5..4c071b777fe83 100644
--- a/paddle/fluid/framework/operator.h
+++ b/paddle/fluid/framework/operator.h
@@ -30,6 +30,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/op_info.h"
 #include "paddle/fluid/framework/op_kernel_type.h"
+#include "paddle/fluid/framework/pten_utils.h"
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/framework/selected_rows.h"
 #include "paddle/fluid/framework/tensor.h"
@@ -39,6 +40,8 @@ limitations under the License. */
 #include "paddle/fluid/platform/variant.h"
 #include "paddle/utils/flat_hash_map.h"
 
+#include "paddle/pten/include/core.h"
+
 namespace paddle {
 namespace framework {
 class InferShapeContext;
@@ -151,7 +154,7 @@ class OperatorBase {
   virtual void Stop() {}
 
   /// if scope is not null, also show dimensions of arguments
-  virtual std::string DebugStringEx(const Scope* scope) const;
+  virtual std::string DebugStringEx(const ScopeBase* scope) const;
   std::string DebugString() const { return DebugStringEx(nullptr); }
 
   virtual bool SupportGPU() const { return false; }
@@ -529,6 +532,17 @@ class OperatorWithKernel : public OperatorBase {
     return kernel_type_->place_;
   }
 
+  /* member functions for adapting to pten lib */
+  /** In the Tensor calculation library, the new Kernel adopts a clearer and
+    * more streamlined design. The arguments of the Kernel and the input and
+    * output arguments registered in the original OpMaker do not match in some
+    * cases, so we use map to record the arguments required by the kernel.
+    * When selecting Kernel during Op execution, select the arguments of the
+    * original Op according to the GetExpectedPtenKernelArgs returned arguments.
+    */
+  virtual KernelSignature GetExpectedPtenKernelArgs(
+      const ExecutionContext& ctx) const;
+
  private:
   void RunImpl(const Scope& scope, const platform::Place& place) const final;
   void RunImpl(const Scope& scope, const platform::Place& place,
@@ -550,8 +564,9 @@ class OperatorWithKernel : public OperatorBase {
                                const std::vector<std::string>& inplace_vars,
                                const Scope& exec_scope) const;
 
-  void ChooseKernel(const RuntimeContext& ctx, const Scope& scope,
-                    const platform::Place& place) const;
+  OpKernelType InnerGetExpectedKernelType(const ExecutionContext& ctx) const;
+
+  void ChooseKernel(const ExecutionContext& ctx) const;
 
   void HandleComplexGradToRealGrad(const Scope& scope,
                                    RuntimeContext* ctx) const;
@@ -561,12 +576,19 @@ class OperatorWithKernel : public OperatorBase {
   // By default all input data must be same.
   proto::VarType::Type IndicateDataType(const ExecutionContext& ctx) const;
   // used for IndicateDataType
-  void ParseInputDataType(const ExecutionContext& ctx, const std::string& name,
-                          proto::VarType::Type* type) const;
+  void ParseInputDataType(const std::vector<Variable*>& vars,
+                          const std::string& name,
+                          proto::VarType::Type* data_type) const;
   // used for IndicateOrPromoteVarDataTypes
   Tensor* GetTensorFormInputSafely(const ExecutionContext& ctx,
                                    const std::string& name) const;
 
+  /* member functions for adapting to pten lib */
+  void ChoosePtenKernel(const ExecutionContext& ctx) const;
+
+  void BuildPtenKernelContext(const RuntimeContext& ctx,
+                              platform::DeviceContext* dev_ctx) const;
+
  protected:
   mutable std::unique_ptr<OpKernelType> kernel_type_;
   mutable std::unique_ptr<OpKernelFunc> kernel_func_;
@@ -577,6 +599,15 @@ class OperatorWithKernel : public OperatorBase {
   mutable bool all_kernels_must_compute_runtime_shape_ = false;
   mutable std::mutex cache_update_mutex_;
   mutable bool enable_cache_transfer_scope_ = false;
+  // NOTE(chenweihang): Similar op members are used to adapt to
+  // new pten kernel, if there is a better design in the future,
+  // we may polish the implementation here
+  mutable bool run_pten_kernel_ = false;
+  mutable std::unique_ptr<KernelSignature> pt_kernel_signature_;
+  mutable std::unique_ptr<pten::Kernel> pt_kernel_;
+  // In order to reduce the compatibility phase
+  // performance overhead, temporarily cache KernelContext
+  mutable std::unique_ptr<pten::KernelContext> pt_kernel_context_;
 };
 
 extern bool OpSupportGPU(const std::string& op_type);
diff --git a/paddle/fluid/framework/operator_test.cc b/paddle/fluid/framework/operator_test.cc
index 368913700167e..df7e3c4f6dde3 100644
--- a/paddle/fluid/framework/operator_test.cc
+++ b/paddle/fluid/framework/operator_test.cc
@@ -439,9 +439,8 @@ TEST(IndicateVarDataTypeTest, lodtensor) {
     std::string ex_msg = err.what();
     EXPECT_TRUE(
         ex_msg.find(
-            "The Tensor in the indicate_lod_tensor_data_type_test Op's "
-            "Input Variable LoDTensor(lodtensor_1) is not initialized") !=
-        std::string::npos);
+            "The indicate_lod_tensor_data_type_test Op's Input Variable "
+            "`LoDTensor` contains uninitialized Tensor.") != std::string::npos);
   }
   ASSERT_TRUE(caught);
 }
@@ -466,9 +465,9 @@ TEST(IndicateVarDataTypeTest, selectedrows) {
     caught = true;
     std::string ex_msg = err.what();
     EXPECT_TRUE(
-        ex_msg.find("The Tensor in the indicate_selected_rows_data_type_test "
-                    "Op's Input Variable SelectedRows(selected_rows_1) is not "
-                    "initialized") != std::string::npos);
+        ex_msg.find("The indicate_selected_rows_data_type_test Op's "
+                    "Input Variable `SelectedRows` contains uninitialized "
+                    "Tensor.") != std::string::npos);
   }
   ASSERT_TRUE(caught);
 }
diff --git a/paddle/fluid/framework/paddle2cinn/CMakeLists.txt b/paddle/fluid/framework/paddle2cinn/CMakeLists.txt
index 04931c7c4b35e..6eef1a00e1e73 100644
--- a/paddle/fluid/framework/paddle2cinn/CMakeLists.txt
+++ b/paddle/fluid/framework/paddle2cinn/CMakeLists.txt
@@ -1,8 +1,8 @@
 cc_library(cinn_cache_key SRCS cinn_cache_key.cc DEPS boost graph graph_helper lod_tensor proto_desc)
-cc_library(build_cinn_pass SRCS build_cinn_pass.cc DEPS pass subgraph_detector cinn_compiler)
+cc_library(build_cinn_pass SRCS build_cinn_pass.cc DEPS pass subgraph_detector graph_pattern_detector cinn_compiler errors enforce)
 cc_library(transform_desc SRCS transform_desc.cc DEPS proto_desc cinn)
-cc_library(cinn_graph_symbolization SRCS cinn_graph_symbolization.cc DEPS lod_tensor graph graph_helper transform_desc cinn)
-cc_library(cinn_compiler SRCS cinn_compiler.cc DEPS graph lod_tensor cinn_cache_key cinn_graph_symbolization cinn)
+cc_library(cinn_graph_symbolization SRCS cinn_graph_symbolization.cc DEPS lod_tensor graph transform_desc cinn)
+cc_library(cinn_compiler SRCS cinn_compiler.cc DEPS framework_proto graph lod_tensor cinn_cache_key cinn_graph_symbolization cinn)
 
 cc_test(cinn_cache_key_test SRCS cinn_cache_key_test.cc DEPS cinn_cache_key)
 cc_test(build_cinn_pass_test SRCS build_cinn_pass_test.cc DEPS build_cinn_pass cinn_compiler)
diff --git a/paddle/fluid/framework/paddle2cinn/build_cinn_pass.cc b/paddle/fluid/framework/paddle2cinn/build_cinn_pass.cc
index 0664a63c2b72b..0cff68c41eb10 100644
--- a/paddle/fluid/framework/paddle2cinn/build_cinn_pass.cc
+++ b/paddle/fluid/framework/paddle2cinn/build_cinn_pass.cc
@@ -17,6 +17,7 @@ limitations under the License. */
 #include <algorithm>
 #include <iterator>
 #include <memory>
+#include <regex>
 #include <string>
 #include <unordered_map>
 #include <unordered_set>
@@ -25,10 +26,19 @@ limitations under the License. */
 
 #include "cinn/frontend/op_mapper_registry.h"
 #include "cinn/frontend/op_mappers/use_op_mappers.h"
+#include "gflags/gflags.h"
+#include "glog/logging.h"
 #include "paddle/fluid/framework/ir/graph.h"
+#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
 #include "paddle/fluid/framework/ir/node.h"
 #include "paddle/fluid/framework/ir/subgraph_detector.h"
+#include "paddle/fluid/framework/op_proto_maker.h"
 #include "paddle/fluid/framework/paddle2cinn/cinn_compiler.h"
+#include "paddle/fluid/platform/enforce.h"
+#include "paddle/fluid/platform/errors.h"
+
+DECLARE_string(allow_cinn_ops);
+DECLARE_string(deny_cinn_ops);
 
 namespace paddle {
 namespace framework {
@@ -39,13 +49,100 @@ using framework::ir::Node;
 
 using GraphNodeVec = std::vector<Node*>;
 using GraphNodeSet = std::unordered_set<Node*>;
+using GraphNodeMap = std::unordered_map<Node*, Node*>;
+
+namespace {
+// The delim(`;`) that is used to split the FLAGS_allow_cinn_ops
+// & FLAGS_deny_cinn_ops.
+constexpr char kDelim[] = ";";
+
+const std::unordered_map<std::string, std::unordered_set<std::string>>
+    kDenyParamMap = {{"batch_norm", {"ReserveSpace"}},
+                     {"batch_norm_grad", {"ReserveSpace"}}};
+
+std::unordered_set<std::string> GetDenyVarNames(const GraphNodeSet& cluster) {
+  std::unordered_set<std::string> deny_var_set;
+
+  auto get_debug_info = [](const std::unordered_set<std::string>& var_names) {
+    std::string debug_info = "[";
+    for (auto& var : var_names) {
+      debug_info.append(var);
+      debug_info.append(", ");
+    }
+    debug_info.append("]");
+    return debug_info;
+  };
+
+  for (auto* op : cluster) {
+    if (kDenyParamMap.count(op->Name())) {
+      const auto* desc = op->Op();
+      PADDLE_ENFORCE_NE(desc, nullptr,
+                        platform::errors::PreconditionNotMet(
+                            "The Op %s's OpDesc should not be NULL, which has "
+                            "a parameter in kDenyParamMap.",
+                            op->Name().c_str()));
+
+      auto deny_param_names = kDenyParamMap.at(op->Name());
+      VLOG(4) << "We found deny param " << get_debug_info(deny_param_names)
+              << " in op [" << op->Name() << "].";
+
+      for (const auto& param_name : deny_param_names) {
+        if (desc->Inputs().count(param_name)) {
+          const auto& arg_names = desc->Input(param_name);
+          for (const auto& arg_name : arg_names) {
+            deny_var_set.insert(arg_name);
+            VLOG(4) << "deny param [" << param_name << "]'s argument name"
+                    << " is [" << arg_name << "].";
+          }
+        }
+
+        if (desc->HasOutput(param_name)) {
+          const auto& arg_names = desc->Output(param_name);
+          for (const auto& arg_name : arg_names) {
+            deny_var_set.insert(arg_name);
+            VLOG(4) << "deny param [" << param_name << "]'s argument name"
+                    << " is [" << arg_name << "].";
+          }
+        }
+      }
+    }
+  }
+
+  VLOG(4) << "All deny var names are " << get_debug_info(deny_var_set);
+
+  return deny_var_set;
+}
+
+std::unordered_set<std::string> StringSplit(const std::string& str,
+                                            const std::string& delim) {
+  std::regex reg(delim);
+  std::unordered_set<std::string> elems{
+      std::sregex_token_iterator(str.begin(), str.end(), reg, -1),
+      std::sregex_token_iterator()};
+  elems.erase("");
+  return elems;
+}
+
+int ExtractOpRole(const GraphNodeSet& cluster) {
+  std::unordered_set<int> op_roles;
+  std::string attr_name = OpProtoAndCheckerMaker::OpRoleAttrName();
+  for (auto* n : cluster) {
+    if (n->Op() && n->Op()->HasAttr(attr_name)) {
+      op_roles.insert(BOOST_GET_CONST(int, n->Op()->GetAttr(attr_name)));
+    }
+  }
+  if (op_roles.size() == 1U) {
+    return *(op_roles.begin());
+  } else {
+    return static_cast<int>(OpRole::kNotSpecified);
+  }
+}
 
 // Deal with subgraph's feed input var node:
 // create a new input var node and it's feed op node
-void AddFeedOpAndVar(const std::unordered_set<Node*>& feed_vars,
-                     const GraphNodeSet& cluster,
-                     const std::unordered_map<Node*, Node*>& old_op2new_op,
-                     Graph* graph) {
+void AddFeedOpAndVar(const GraphNodeSet& feed_vars, const GraphNodeSet& cluster,
+                     const GraphNodeMap& old_op2new_op,
+                     const GraphNodeMap& old_var2new_var, Graph* graph) {
   for (auto* old_var : feed_vars) {
     // create feed op
     OpDesc desc;
@@ -53,21 +150,20 @@ void AddFeedOpAndVar(const std::unordered_set<Node*>& feed_vars,
     desc.SetOutput("Out", {old_var->Name()});
     auto op = graph->CreateOpNode(&desc);
 
-    // create new feed var node (SSAGraph)
-    auto var = graph->CreateVarNode(old_var->Var());
+    // get new feed var node
+    auto* var = old_var2new_var.at(old_var);
+    VLOG(4) << "Add Feed Op before: " << var->Name();
 
     // link feed op and feed var
-    op->outputs = {var};
-    var->inputs = {op};
+    IR_NODE_LINK_TO(op, var);
 
     // link feed var to cluster op
     for (auto* old_op : old_var->outputs) {
       if (cluster.count(old_op)) {
-        var->outputs.emplace_back(old_op2new_op.at(old_op));
-        old_op2new_op.at(old_op)->inputs.emplace_back(var);
+        IR_NODE_LINK_TO(var, old_op2new_op.at(old_op));
       }
       // Do not need relink old op or old var here, they will be
-      // fixed in RemoveLinkFromCluster, here we just deal with
+      // fixed in RemoveSubGraphFromGraph, here we just deal with
       // new subgraph's node.
     }
   }
@@ -76,17 +172,16 @@ void AddFeedOpAndVar(const std::unordered_set<Node*>& feed_vars,
 // Deal with subgraph's parameter var node:
 // create a new input var node, it's data will get by scope,
 // so it don't need feed op
-void AddParamVar(const std::unordered_set<Node*>& param_vars,
-                 const GraphNodeSet& cluster,
-                 const std::unordered_map<Node*, Node*>& old_op2new_op,
-                 Graph* graph) {
+void AddParamVar(const GraphNodeSet& param_vars, const GraphNodeSet& cluster,
+                 const GraphNodeMap& old_op2new_op,
+                 const GraphNodeMap& old_var2new_var, Graph* graph) {
   for (auto* old_var : param_vars) {
-    auto var = graph->CreateVarNode(old_var->Var());
+    auto* var = old_var2new_var.at(old_var);
+    VLOG(4) << "Add Param Var Node: " << var->Name();
 
     for (auto* old_op : old_var->outputs) {
       if (cluster.count(old_op)) {
-        var->outputs.emplace_back(old_op2new_op.at(old_op));
-        old_op2new_op.at(old_op)->inputs.emplace_back(var);
+        IR_NODE_LINK_TO(var, old_op2new_op.at(old_op));
       }
     }
   }
@@ -94,17 +189,16 @@ void AddParamVar(const std::unordered_set<Node*>& param_vars,
 
 // Deal with subgraph's outputs var node:
 // create a new output var node and it's fetch op
-void AddOutputVar(const std::unordered_set<Node*>& output_vars,
-                  const GraphNodeSet& cluster,
-                  const std::unordered_map<Node*, Node*>& old_op2new_op,
-                  Graph* graph) {
+void AddOutputVar(const GraphNodeSet& output_vars, const GraphNodeSet& cluster,
+                  const GraphNodeMap& old_op2new_op,
+                  const GraphNodeMap& old_var2new_var, Graph* graph) {
   for (auto* old_var : output_vars) {
-    auto var = graph->CreateVarNode(old_var->Var());
+    auto* var = old_var2new_var.at(old_var);
+    VLOG(4) << "Add Output Var Node: " << var->Name();
 
     for (auto* old_op : old_var->inputs) {
       if (cluster.count(old_op)) {
-        var->inputs.emplace_back(old_op2new_op.at(old_op));
-        old_op2new_op.at(old_op)->outputs.emplace_back(var);
+        IR_NODE_LINK_TO(old_op2new_op.at(old_op), var);
       }
     }
   }
@@ -120,32 +214,45 @@ std::unique_ptr<Graph> CreateNewSubGraph(const GraphNodeSet& cluster,
   // the ProgramDesc is useless, so here we pass a temporary object.
   auto subgraph = std::make_unique<Graph>(framework::ProgramDesc());
 
-  std::unordered_map<Node*, Node*> old_op2new_op;
+  GraphNodeMap old_op2new_op;
   for (auto* op : cluster) {
     auto sub_node = subgraph->CreateOpNode(op->Op());
     old_op2new_op[op] = sub_node;
   }
 
-  std::unordered_map<Node*, Node*> old_var2new_var;
+  GraphNodeMap old_var2new_var;
   for (auto* var : cluster_internals) {
-    Node* sub_node;
-    if (var->Var() == nullptr) {
-      sub_node = subgraph->CreateEmptyNode(var->Name(), var->NodeType());
-    } else {
-      sub_node = subgraph->CreateVarNode(var->Var());
-    }
+    PADDLE_ENFORCE_NOT_NULL(var->Var(),
+                            platform::errors::PreconditionNotMet(
+                                "The var desc of the node in cluster_internals "
+                                "shouldn't be null."));
+    auto* sub_node = subgraph->CreateVarNode(var->Var());
     old_var2new_var[var] = sub_node;
   }
+  for (auto* var : cluster_inputs) {
+    if (var->Var()) {
+      auto* sub_node = subgraph->CreateVarNode(var->Var());
+      old_var2new_var[var] = sub_node;
+    }
+  }
+  for (auto* var : cluster_outputs) {
+    if (var->Var()) {
+      auto* sub_node = subgraph->CreateVarNode(var->Var());
+      old_var2new_var[var] = sub_node;
+    }
+  }
 
-  std::unordered_set<Node*> need_feed_vars;
+  GraphNodeSet need_feed_vars;
   std::unordered_set<Node *> param_vars, output_vars;
   // the subgraph is independently, so here we only need link
   // to the node in new subgraph, and discard the link to
   // out-graph.
   for (auto* op : cluster) {
     for (auto* var : op->inputs) {
-      if (cluster_internals.count(var)) {
-        old_op2new_op[op]->inputs.emplace_back(old_var2new_var[var]);
+      // one output var maybe an input of the cluster
+      if (cluster_internals.count(var) ||
+          (cluster_outputs.count(var) && old_var2new_var.count(var))) {
+        IR_NODE_LINK_TO(old_var2new_var.at(var), old_op2new_op.at(op));
       } else if (cluster_inputs.count(var) && var->Var() != nullptr) {
         if (var->Var()->IsParameter()) {
           // Parameters have been preserved in scope, compared to feed var,
@@ -162,7 +269,7 @@ std::unique_ptr<Graph> CreateNewSubGraph(const GraphNodeSet& cluster,
     }
     for (auto* var : op->outputs) {
       if (cluster_internals.count(var)) {
-        old_op2new_op[op]->outputs.emplace_back(old_var2new_var[var]);
+        IR_NODE_LINK_TO(old_op2new_op.at(op), old_var2new_var.at(var));
       } else if (cluster_outputs.count(var) && var->Var() != nullptr) {
         // Create new output var node to guarantee the independency of
         // subgraph. In other words, the subgraph has no connection with
@@ -172,22 +279,12 @@ std::unique_ptr<Graph> CreateNewSubGraph(const GraphNodeSet& cluster,
     }
   }
 
-  AddFeedOpAndVar(need_feed_vars, cluster, old_op2new_op, subgraph.get());
-  AddParamVar(param_vars, cluster, old_op2new_op, subgraph.get());
-  AddOutputVar(output_vars, cluster, old_op2new_op, subgraph.get());
-
-  for (auto* var : cluster_internals) {
-    for (auto* op : var->inputs) {
-      if (cluster.count(op)) {
-        old_var2new_var[var]->inputs.emplace_back(old_op2new_op[op]);
-      }
-    }
-    for (auto* op : var->outputs) {
-      if (cluster.count(op)) {
-        old_var2new_var[var]->outputs.emplace_back(old_op2new_op[op]);
-      }
-    }
-  }
+  AddFeedOpAndVar(need_feed_vars, cluster, old_op2new_op, old_var2new_var,
+                  subgraph.get());
+  AddParamVar(param_vars, cluster, old_op2new_op, old_var2new_var,
+              subgraph.get());
+  AddOutputVar(output_vars, cluster, old_op2new_op, old_var2new_var,
+               subgraph.get());
 
   return subgraph;
 }
@@ -200,17 +297,24 @@ std::unique_ptr<Graph> CreateNewSubGraph(const GraphNodeSet& cluster,
 // out-graph should not using this node at all.
 // cluster_inputs & cluster_outputs & cluster_internals == NULL
 // cluster_outputs | cluster_internals == all graph op's outputs node
-void AnalyseClusterVariables(const GraphNodeSet& cluster,
-                             GraphNodeSet* cluster_inputs,
-                             GraphNodeSet* cluster_outputs,
-                             GraphNodeSet* cluster_internals) {
+void AnalyseClusterVariables(
+    const GraphNodeSet& cluster,
+    const std::unordered_set<std::string>& deny_var_set,
+    GraphNodeSet* cluster_inputs, GraphNodeSet* cluster_outputs,
+    GraphNodeSet* cluster_internals) {
   // collecting all input and output of op
   for (auto* op_node : cluster) {
+    const auto& op_name = op_node->Name();
     for (auto* input_var_node : op_node->inputs) {
-      cluster_inputs->insert(input_var_node);
+      if (!deny_var_set.count(input_var_node->Name())) {
+        // ignore deny var node
+        cluster_inputs->insert(input_var_node);
+      }
     }
     for (auto* output_var_node : op_node->outputs) {
-      cluster_outputs->insert(output_var_node);
+      if (!deny_var_set.count(output_var_node->Name())) {
+        cluster_outputs->insert(output_var_node);
+      }
     }
   }
   // remove output node from cluster_inputs,
@@ -238,117 +342,81 @@ void AnalyseClusterVariables(const GraphNodeSet& cluster,
   }
 }
 
-Node* AddSpecialOpToGraph(const GraphNodeSet& cluster_inputs,
-                          const GraphNodeSet& cluster_outputs,
-                          const std::string& compilation_key, Graph* graph) {
-  // add special cinn op
-  framework::OpDesc special_op_desc;
-  special_op_desc.SetType(kCinnLaunchOp);
+void AddLinkToCinnOp(const GraphNodeSet& cluster_inputs,
+                     const GraphNodeSet& cluster_outputs, Node* cinn_op_node) {
+  // add new link from cluster_inputs to cinn_op_node
+  for (auto* var_node : cluster_inputs) {
+    IR_NODE_LINK_TO(var_node, cinn_op_node);
+  }
+
+  // add new link from cinn_op_node to cluster_outputs
+  for (auto* var_node : cluster_outputs) {
+    IR_NODE_LINK_TO(cinn_op_node, var_node);
+  }
+}
+
+void AddCinnOpToGraph(const GraphNodeSet& cluster,
+                      const GraphNodeSet& cluster_inputs,
+                      const GraphNodeSet& cluster_outputs,
+                      const std::string& compilation_key,
+                      const std::unordered_set<std::string>& deny_var_set,
+                      Graph* graph) {
+  // Add the cinn launch op
+  framework::OpDesc cinn_op_desc;
+  cinn_op_desc.SetType(kCinnLaunchOp);
   std::vector<std::string> input_names;
+
   std::for_each(cluster_inputs.begin(), cluster_inputs.end(),
-                [&input_names](Node* n) {
-                  if (n->Var() != nullptr) {
+                [&input_names, &deny_var_set](Node* n) {
+                  if (n->Var() != nullptr && !deny_var_set.count(n->Name())) {
                     input_names.emplace_back(n->Name());
                   }
                 });
-  special_op_desc.SetInput("X", input_names);
+  cinn_op_desc.SetInput("X", input_names);
   std::vector<std::string> output_names;
   std::for_each(cluster_outputs.begin(), cluster_outputs.end(),
-                [&output_names](Node* n) {
-                  if (n->Var() != nullptr) {
+                [&output_names, &deny_var_set](Node* n) {
+                  if (n->Var() != nullptr && !deny_var_set.count(n->Name())) {
                     output_names.emplace_back(n->Name());
                   }
                 });
-  special_op_desc.SetOutput("Out", output_names);
-  special_op_desc.SetAttr(kCompilationKey, compilation_key);
-  special_op_desc.Flush();
-  auto* special_op_node = graph->CreateOpNode(&special_op_desc);
-  special_op_node->inputs.assign(cluster_inputs.begin(), cluster_inputs.end());
-  special_op_node->outputs.assign(cluster_outputs.begin(),
-                                  cluster_outputs.end());
-  return special_op_node;
-}
-
-void AddLinkToSpecialOp(const GraphNodeSet& cluster_inputs,
-                        const GraphNodeSet& cluster_outputs,
-                        Node* special_op_node) {
-  // add new link from cluster_inputs to special_op_node
-  for (auto* var_node : cluster_inputs) {
-    var_node->outputs.push_back(special_op_node);
-  }
-
-  // add new link from special_op_node to cluster_outputs
-  for (auto* var_node : cluster_outputs) {
-    var_node->inputs.push_back(special_op_node);
-  }
-}
-
-void RemoveLinkFromCluster(const GraphNodeSet& cluster,
-                           const GraphNodeSet& cluster_inputs,
-                           const GraphNodeSet& cluster_outputs) {
-  // remove all nodes in cluster
-  auto get_preserved_ops = [&cluster](const GraphNodeVec& ops) {
-    GraphNodeVec nodes;
-    for (auto* op_node : ops) {
-      if (cluster.find(op_node) == cluster.end()) {
-        nodes.emplace_back(op_node);
-      }
-    }
-    return nodes;
-  };
-
-  // removing useless link from cluster_inputs to cluster
-  for (auto* var_node : cluster_inputs) {
-    auto preserved_ops = get_preserved_ops(var_node->outputs);
-    var_node->outputs.assign(preserved_ops.begin(), preserved_ops.end());
-    // According to SSA form, a var node must not be any two op's output,
-    // and the cluster_inputs var nodes is defined as an out-graph op's
-    // output, so the cluster_inputs var nodes are not any subgraph op's
-    // output. Do not reassign input list here.
-  }
-
-  // removing useless link from cluster to cluster_outputs
-  for (auto* var_node : cluster_outputs) {
-    auto preserved_ops = get_preserved_ops(var_node->inputs);
-    var_node->inputs.assign(preserved_ops.begin(), preserved_ops.end());
-
-    // Note that cluster_outputs var node maybe some subgraph op's input,
-    // here we need remove them.
-    preserved_ops = get_preserved_ops(var_node->outputs);
-    var_node->outputs.assign(preserved_ops.begin(), preserved_ops.end());
-  }
+  cinn_op_desc.SetOutput("Out", output_names);
+  cinn_op_desc.SetAttr(kCompilationKey, compilation_key);
+  cinn_op_desc.SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(),
+                       ExtractOpRole(cluster));
+  cinn_op_desc.Flush();
+  auto* cinn_op_node = graph->CreateOpNode(&cinn_op_desc);
+  // Add new links from or to the the cinn launch op node
+  AddLinkToCinnOp(cluster_inputs, cluster_outputs, cinn_op_node);
+
+  VLOG(4) << "Add op [" << kCinnLaunchOp << "] into graph.";
 }
 
 // Removing cluster node and internals node from Graph
 void RemoveSubGraphFromGraph(const GraphNodeSet& cluster,
                              const GraphNodeSet& cluster_internals,
                              Graph* graph) {
-  for (auto* op_node : cluster) {
-    graph->RemoveNode(op_node);
-  }
-  for (auto* var_node : cluster_internals) {
-    graph->RemoveNode(var_node);
-  }
+  const std::unordered_set<const Node*> const_cluster{cluster.cbegin(),
+                                                      cluster.cend()};
+  const std::unordered_set<const Node*> const_internals{
+      cluster_internals.cbegin(), cluster_internals.cend()};
+  ir::GraphSafeRemoveNodes(graph, const_cluster);
+  ir::GraphSafeRemoveNodes(graph, const_internals);
 }
 
-// Replacing Cinn subgraph to a special op node, whose op_type is
+// Replacing Cinn subgraph to a cinn op node, whose op_type is
 // kCinnLaunchOp, and inputs ares cluster_inputs and outputs are
 // cluster_outputs.
-// Meanwhile, move all links of cluster to the special op.
-void ReplaceSubGraphWithSpecialOpNode(const GraphNodeSet& cluster,
-                                      const GraphNodeSet& cluster_inputs,
-                                      const GraphNodeSet& cluster_outputs,
-                                      const GraphNodeSet& cluster_internals,
-                                      const std::string& compilation_key,
-                                      Graph* graph) {
-  // First, add the special op node whose name is "kCinnLaunchOp" into graph
-  auto special_op_node = AddSpecialOpToGraph(cluster_inputs, cluster_outputs,
-                                             compilation_key, graph);
-  // Second, remove all graph's links which are from or to cluster nodes
-  RemoveLinkFromCluster(cluster, cluster_inputs, cluster_outputs);
-  // Third, add new links from or to the the special op node
-  AddLinkToSpecialOp(cluster_inputs, cluster_outputs, special_op_node);
-  // Finally, remove the cinn sub graph from graph
+// Meanwhile, move all links of cluster to the cinn op.
+void ReplaceSubGraphWithCinnOpNode(
+    const GraphNodeSet& cluster, const GraphNodeSet& cluster_inputs,
+    const GraphNodeSet& cluster_outputs, const GraphNodeSet& cluster_internals,
+    const std::string& compilation_key,
+    const std::unordered_set<std::string>& deny_var_set, Graph* graph) {
+  // Add the cinn op node whose name is "kCinnLaunchOp" into graph
+  AddCinnOpToGraph(cluster, cluster_inputs, cluster_outputs, compilation_key,
+                   deny_var_set, graph);
+  // Remove the cinn subgraph from graph
   RemoveSubGraphFromGraph(cluster, cluster_internals, graph);
 }
 
@@ -357,31 +425,71 @@ void ReplaceSubGraphWithSpecialOpNode(const GraphNodeSet& cluster,
 // all of op node supported by CINN. We using OpMapperRegistry
 // to check whether the op node supported by CINN.
 void SearchAllSubgraphs(Graph* graph) {
-  auto teller = [](const Node* node) {
-    return ::cinn::frontend::OpMapperRegistry::Global()->Find(node->Name()) !=
-           nullptr;
+  auto allow_ops = StringSplit(FLAGS_allow_cinn_ops, kDelim);
+  auto deny_ops = StringSplit(FLAGS_deny_cinn_ops, kDelim);
+  auto teller = [&allow_ops, &deny_ops](const Node* node) {
+    bool registered = ::cinn::frontend::OpMapperRegistry::Global()->Find(
+                          node->Name()) != nullptr;
+    // if the op type is registered in CINN and allow_ops is not empty, return
+    // true only when it is in allow_ops
+    if (allow_ops.size()) {
+      return registered && allow_ops.count(node->Name());
+    }
+    // if the op type is registered in CINN and deny_ops is not empty, return
+    // true only when it is not in deny_ops
+    if (deny_ops.size()) {
+      return registered && !deny_ops.count(node->Name());
+    }
+    // if the user doesn't set FLAGS_allow_cinn_ops and FLAGS_deny_cinn_ops,
+    // return true only when it is registered in CINN
+    return registered;
   };
+  VLOG(4) << "The allowed Cinn Ops: " << FLAGS_allow_cinn_ops;
+  VLOG(4) << "The denied Cinn Ops: " << FLAGS_deny_cinn_ops;
   std::vector<GraphNodeVec> clusters =
       framework::ir::SubgraphDetector(graph, teller)();
 
+  auto cluster_debug_info = [](const GraphNodeSet& cluster) {
+    std::string res = "(";
+    for (auto* node : cluster) {
+      res.append(node->Name());
+      res.append(", ");
+    }
+    res.append(")");
+    return res;
+  };
+
   auto* cinn_compiler = CinnCompiler::GetInstance();
   for (const auto& node_vec : clusters) {
     // Classify var node to inputs, outputs, and internals.
     GraphNodeSet cluster_set(node_vec.begin(), node_vec.end());
 
+    auto deny_var_set = GetDenyVarNames(cluster_set);
+
     GraphNodeSet cluster_inputs, cluster_outputs, cluster_internals;
-    AnalyseClusterVariables(cluster_set, &cluster_inputs, &cluster_outputs,
-                            &cluster_internals);
+    AnalyseClusterVariables(cluster_set, deny_var_set, &cluster_inputs,
+                            &cluster_outputs, &cluster_internals);
+
+    VLOG(4) << "Cluster Ops: " << cluster_debug_info(cluster_set);
+    VLOG(4) << "Cluster input vars: " << cluster_debug_info(cluster_inputs);
+    VLOG(4) << "Cluster output vars: " << cluster_debug_info(cluster_outputs);
+    VLOG(4) << "Cluster internal vars: "
+            << cluster_debug_info(cluster_internals);
+
     // Create a new subgraph according to the found cluster and
     // save it in CinnCompiler
     std::string compilation_key = cinn_compiler->AddGraph(CreateNewSubGraph(
         cluster_set, cluster_internals, cluster_inputs, cluster_outputs));
-    // Replace the found cluster to a new special op node
-    ReplaceSubGraphWithSpecialOpNode(cluster_set, cluster_inputs,
-                                     cluster_outputs, cluster_internals,
-                                     compilation_key, graph);
+    VLOG(4) << "Compilation Key:\n"
+            << cinn_compiler->ReadableKey(compilation_key);
+
+    // Replace the found cluster to a new cinn op node
+    ReplaceSubGraphWithCinnOpNode(cluster_set, cluster_inputs, cluster_outputs,
+                                  cluster_internals, compilation_key,
+                                  deny_var_set, graph);
   }
 }
+}  // namespace
 
 void BuildCinnPass::ApplyImpl(Graph* graph) const { SearchAllSubgraphs(graph); }
 
diff --git a/paddle/fluid/framework/paddle2cinn/build_cinn_pass.h b/paddle/fluid/framework/paddle2cinn/build_cinn_pass.h
index 556ff228915e4..1c07fb314e92d 100644
--- a/paddle/fluid/framework/paddle2cinn/build_cinn_pass.h
+++ b/paddle/fluid/framework/paddle2cinn/build_cinn_pass.h
@@ -20,7 +20,7 @@ namespace paddle {
 namespace framework {
 namespace paddle2cinn {
 
-constexpr char kCinnLaunchOp[] = "CinnLaunchOp";
+constexpr char kCinnLaunchOp[] = "cinn_launch";
 constexpr char kCompilationKey[] = "compilation_key";
 
 // A pass named BuildCinnPass, the function of this pass is:
diff --git a/paddle/fluid/framework/paddle2cinn/cinn_compiler.cc b/paddle/fluid/framework/paddle2cinn/cinn_compiler.cc
index 44cea60bdcb8e..97cb7a558d59e 100644
--- a/paddle/fluid/framework/paddle2cinn/cinn_compiler.cc
+++ b/paddle/fluid/framework/paddle2cinn/cinn_compiler.cc
@@ -14,14 +14,16 @@
 
 #include "paddle/fluid/framework/paddle2cinn/cinn_compiler.h"
 
+#include <cstdint>
+#include <iterator>
 #include <map>
 #include <memory>
 #include <string>
+#include <unordered_map>
 
 #include "cinn/common/target.h"
 #include "cinn/common/type.h"
 #include "cinn/frontend/decomposer/use_decomposer.h"
-#include "cinn/frontend/net_builder.h"  // need to remove after
 #include "cinn/frontend/pass/use_program_pass.h"
 #include "cinn/frontend/program_pass.h"
 #include "cinn/frontend/syntax.h"
@@ -29,19 +31,26 @@
 #include "cinn/hlir/framework/graph_compiler.h"
 #include "cinn/hlir/framework/pass.h"
 #include "cinn/hlir/pass/use_pass.h"
+#include "paddle/fluid/framework/framework.pb.h"
 #include "paddle/fluid/framework/ir/graph.h"
 #include "paddle/fluid/framework/ir/graph_helper.h"
+#include "paddle/fluid/framework/ir/node.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/paddle2cinn/cinn_graph_symbolization.h"
 #include "paddle/fluid/framework/program_desc.h"
+#include "paddle/fluid/framework/rw_lock.h"
 #include "paddle/fluid/framework/tensor.h"
+#include "paddle/fluid/inference/analysis/dot.h"
 #include "paddle/fluid/platform/enforce.h"
+#include "paddle/fluid/string/string_helper.h"
 
 namespace paddle {
 namespace framework {
 namespace paddle2cinn {
 
 using ir::Graph;
+using ir::Node;
+using inference::analysis::Dot;
 using ::cinn::common::Target;
 using ::cinn::common::Float;
 using ::cinn::hlir::framework::GraphCompiler;
@@ -54,70 +63,152 @@ CinnCompiler* CinnCompiler::GetInstance() {
   return &instance;
 }
 
+const CinnCompiledObject& CinnCompiler::Compile(
+    const Graph& graph,
+    const std::map<std::string, const LoDTensor*>& input_tensors,
+    const Target& target) {
+  VLOG(1) << "-- The graph to be compiled is:\n" << VizGraph(graph);
+  CinnCacheKey cur_key(graph, input_tensors, target.arch_str());
+  bool exist = false;
+  {
+    AutoRDLock r_guard{&rwlock_};
+    exist = cache_.count(cur_key) != 0;
+  }
+  if (!exist) {
+    std::int64_t compiled_num = real_compiled_num_.fetch_add(1);
+    auto compiled_res =
+        CompileGraph(graph, input_tensors, target, compiled_num);
+    AutoWRLock w_guard{&rwlock_};
+    if (!cache_.count(cur_key)) {
+      cache_[cur_key] = std::move(compiled_res);
+    }
+  }
+  AutoRDLock guard{&rwlock_};
+  const auto& cached_boj = *cache_[cur_key];
+  return cached_boj;
+}
+
+const CinnCompiledObject& CinnCompiler::Compile(
+    const std::string& compilation_key,
+    const std::map<std::string, const LoDTensor*>& input_tensors,
+    const Target& target) {
+  const auto& graph = FindGraph(compilation_key);
+  return Compile(graph, input_tensors, target);
+}
+
 std::string CinnCompiler::AddGraph(std::unique_ptr<Graph> graph) {
   std::string graph_key;
   ProgramDesc program;
   GraphToProgram(*graph, &program);
   program.Proto()->SerializeToString(&graph_key);
-  if (!graphs_.count(graph_key)) {
-    graphs_[graph_key] = std::move(graph);
-  } else {
-    LOG(WARNING)
-        << "The graph being added is already in CinnCompiler. Its key is:\n"
-        << graph_key;
-  }
+
+  PADDLE_ENFORCE_EQ(
+      graphs_.count(graph_key), 0,
+      platform::errors::PreconditionNotMet(
+          "The graph to be added is already in CinnCompiler, which is:\n",
+          VizGraph(graph_key).c_str()));
+  graphs_[graph_key] = std::move(graph);
+  VLOG(4) << "-- Add a graph into CinnCompiler, which is:\n"
+          << VizGraph(graph_key);
   return graph_key;
 }
 
 const Graph& CinnCompiler::FindGraph(const std::string& graph_key) const {
   PADDLE_ENFORCE_NE(
       graphs_.count(graph_key), 0,
-      platform::errors::InvalidArgument("Can not find the target graph: %s",
-                                        graph_key.c_str()));
+      platform::errors::PreconditionNotMet(
+          "Can not find the target graph, of which the key is:\n%s",
+          ReadableKey(graph_key).c_str()));
   return *graphs_.at(graph_key);
 }
 
-const CinnCompiledObject& CinnCompiler::Compile(
-    const Graph& graph,
-    const std::map<std::string, const LoDTensor*>& input_tensors,
-    const Target& target) {
-  CinnCacheKey cur_key(graph, input_tensors, target.arch_str());
-  if (!cache_.count(cur_key)) {
-    real_compiled_num_++;
-    cache_[cur_key] = CompileGraph(graph, input_tensors, target);
+std::string CinnCompiler::VizGraph(const std::string& graph_key) const {
+  const Graph& graph = FindGraph(graph_key);
+  return VizGraph(graph);
+}
+
+std::string CinnCompiler::VizGraph(const Graph& graph) const {
+  Dot dot;
+  std::unordered_map<const Node*, std::string> node2dot;
+  int id = 0;
+  // Create nodes
+  for (const Node* n : graph.Nodes()) {
+    std::string node_id = "Node" + std::to_string(id++);
+    if (n->IsOp()) {
+      dot.AddNode(
+          node_id,
+          {Dot::Attr("shape", "box"), Dot::Attr("style", "rounded,filled,bold"),
+           Dot::Attr("color", "#303A3A"), Dot::Attr("fontcolor", "#ffffff")},
+          n->Name());
+    } else if (n->IsVar()) {
+      auto label = n->Name();
+      if (n->Var() && n->Var()->GetType() == proto::VarType::LOD_TENSOR) {
+        auto shape = n->Var()->GetShape();
+        std::vector<std::string> shape_str(shape.size());
+        std::transform(shape.begin(), shape.end(), shape_str.begin(),
+                       [](const auto& val) { return std::to_string(val); });
+        label += "\n" + string::join_strings(shape_str, ',');
+      }
+      dot.AddNode(
+          node_id,
+          {Dot::Attr("shape", "box"), Dot::Attr("style", "rounded,filled,bold"),
+           Dot::Attr("color", n->Var()->IsParameter() ? "#148b97" : "#dddddd"),
+           Dot::Attr("fontcolor",
+                     n->Var()->IsParameter() ? "#ffffff" : "#000000")},
+          label);
+    }
+    node2dot[n] = node_id;
   }
-  return *cache_[cur_key];
+  // Create edges
+  for (const Node* n : graph.Nodes()) {
+    const auto& src_id = node2dot.at(n);
+    for (auto* out : n->outputs) {
+      const auto& dest_id = node2dot.at(out);
+      dot.AddEdge(src_id, dest_id, {});
+    }
+  }
+  return dot.Build();
 }
 
-const CinnCompiledObject& CinnCompiler::Compile(
-    const std::string& compilation_key,
-    const std::map<std::string, const LoDTensor*>& input_tensors,
-    const Target& target) {
-  const auto& graph = FindGraph(compilation_key);
-  return Compile(graph, input_tensors, target);
+std::string CinnCompiler::ReadableKey(
+    const std::string& compilation_key) const {
+  proto::ProgramDesc desc;
+  desc.ParseFromString(compilation_key);
+  return desc.DebugString();
+}
+
+void CinnCompiler::Clear() {
+  {
+    AutoWRLock guard{&rwlock_};
+    graphs_.clear();
+    cache_.clear();
+  }
+  real_compiled_num_.store(1);
 }
 
 std::unique_ptr<CinnCompiledObject> CinnCompiler::CompileGraph(
     const ir::Graph& graph,
     const std::map<std::string, const LoDTensor*>& input_tensors,
-    const Target& target) const {
-  CinnGraphSymbolization symbol{real_compiled_num_, graph, target,
-                                input_tensors};
+    const Target& target, std::int64_t compiled_num) const {
+  CinnGraphSymbolization symbol{compiled_num, graph, target, input_tensors};
   auto frontend_program = symbol();
   ProgramPass::Apply(&frontend_program, target, {"Decomposer"});
   auto cinn_graph = std::make_shared<::cinn::hlir::framework::Graph>(
       frontend_program, target);
-  VLOG(4) << "The " << real_compiled_num_ << "-th compilation ("
+  VLOG(1) << "-- The " << compiled_num << "-th compilation ("
           << target.arch_str() << "), and its related graph:\n"
           << cinn_graph->Visualize();
   ApplyPass(cinn_graph.get(), "OpFusion");
   auto scope = BuildScope(target, cinn_graph);
-  GraphCompiler graph_compiler(target, scope, cinn_graph);
+
+  auto graph_compiler =
+      std::make_unique<GraphCompiler>(target, scope, cinn_graph);
   GraphCompiler::CompileOptions options;
   options.with_instantiate_variables = false;
-  auto compiled_res = graph_compiler.Build(options);
+  auto compiled_res = graph_compiler->Build(options);
   auto compiled_obj = std::make_unique<CinnCompiledObject>();
-  *compiled_obj = {std::move(compiled_res.runtime_program), scope,
+  *compiled_obj = {std::move(graph_compiler),
+                   std::move(compiled_res.runtime_program), scope,
                    symbol.var_model_to_program_map()};
   return compiled_obj;
 }
diff --git a/paddle/fluid/framework/paddle2cinn/cinn_compiler.h b/paddle/fluid/framework/paddle2cinn/cinn_compiler.h
index 3b0fb5cf6965f..29ec1e424cc23 100644
--- a/paddle/fluid/framework/paddle2cinn/cinn_compiler.h
+++ b/paddle/fluid/framework/paddle2cinn/cinn_compiler.h
@@ -15,6 +15,7 @@
 #pragma once
 
 #include <atomic>
+#include <cstdint>
 #include <map>
 #include <memory>
 #include <string>
@@ -25,6 +26,7 @@
 #include "paddle/fluid/framework/ir/graph.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/paddle2cinn/cinn_cache_key.h"
+#include "paddle/fluid/framework/rw_lock.h"
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/platform/macros.h"
 
@@ -33,6 +35,7 @@ namespace framework {
 namespace paddle2cinn {
 
 struct CinnCompiledObject {
+  std::unique_ptr<::cinn::hlir::framework::GraphCompiler> compiler;
   std::unique_ptr<::cinn::hlir::framework::Program> runtime_program;
   std::shared_ptr<::cinn::hlir::framework::Scope> scope;
   std::unordered_map<std::string, std::string> paddle2cinn_varmap;
@@ -61,9 +64,17 @@ class CinnCompiler {
 
   std::string AddGraph(std::unique_ptr<ir::Graph> graph);
 
-  const ir::Graph& FindGraph(const std::string& key) const;
+  const ir::Graph& FindGraph(const std::string& graph_key) const;
 
-  std::int64_t real_compiled_num() const { return real_compiled_num_; }
+  std::string VizGraph(const std::string& graph_key) const;
+
+  std::string VizGraph(const ir::Graph& graph) const;
+
+  std::string ReadableKey(const std::string& compilation_key) const;
+
+  void Clear();
+
+  std::int64_t real_compiled_num() const { return real_compiled_num_.load(); }
 
   ~CinnCompiler() = default;
 
@@ -72,13 +83,14 @@ class CinnCompiler {
   std::unique_ptr<CinnCompiledObject> CompileGraph(
       const ir::Graph& graph,
       const std::map<std::string, const LoDTensor*>& input_tensors,
-      const ::cinn::common::Target& target) const;
+      const ::cinn::common::Target& target, std::int64_t compiled_num) const;
 
   std::unordered_map<std::string, std::unique_ptr<ir::Graph>> graphs_;
   std::unordered_map<CinnCacheKey, std::unique_ptr<CinnCompiledObject>,
                      CinnCacheKey::Hash>
       cache_;
-  std::atomic_int64_t real_compiled_num_{0};
+  std::atomic_int64_t real_compiled_num_{1};
+  mutable RWLock rwlock_;
 
   DISABLE_COPY_AND_ASSIGN(CinnCompiler);
 };
diff --git a/paddle/fluid/framework/paddle2cinn/cinn_compiler_test.cc b/paddle/fluid/framework/paddle2cinn/cinn_compiler_test.cc
index 22792e0f8c359..145d3d83d4509 100644
--- a/paddle/fluid/framework/paddle2cinn/cinn_compiler_test.cc
+++ b/paddle/fluid/framework/paddle2cinn/cinn_compiler_test.cc
@@ -14,12 +14,20 @@
 
 #include "paddle/fluid/framework/paddle2cinn/cinn_compiler.h"
 
+#include <algorithm>
 #include <map>
 #include <memory>
+#include <ostream>
 #include <string>
+#include <unordered_map>
+#include <unordered_set>
+#include <vector>
 
 #include "cinn/common/target.h"
+#include "gflags/gflags.h"
+#include "glog/logging.h"
 #include "gtest/gtest.h"
+#include "paddle/fluid/framework/ddim.h"
 #include "paddle/fluid/framework/ir/graph.h"
 #include "paddle/fluid/framework/ir/pass.h"
 #include "paddle/fluid/framework/lod_tensor.h"
@@ -29,13 +37,76 @@
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/place.h"
 
+DECLARE_string(allow_cinn_ops);
+DECLARE_string(deny_cinn_ops);
+
 namespace paddle {
 namespace framework {
 namespace paddle2cinn {
-
 using ir::Graph;
 using ::cinn::common::Target;
 
+namespace {
+template <typename T, typename Alloc = std::allocator<T>>
+std::ostream& operator<<(std::ostream& os, const std::vector<T, Alloc>& vec) {
+  os << "{ ";
+  for (auto e : vec) {
+    os << e << " ";
+  }
+  os << "}\n";
+  return os;
+}
+
+// Get compilation_key values
+std::vector<std::string> GetCompilationKeys(const Graph& graph) {
+  std::vector<std::string> compilation_keys;
+  for (auto& node : graph.Nodes()) {
+    if (node->IsOp() && node->Name() == kCinnLaunchOp) {
+      compilation_keys.emplace_back(
+          BOOST_GET_CONST(std::string, node->Op()->GetAttr(kCompilationKey)));
+    }
+  }
+  return compilation_keys;
+}
+
+// Extract op types from a graph
+std::unordered_set<std::string> ExtractOpTypes(const Graph& graph) {
+  std::unordered_set<std::string> op_types;
+  for (auto& node : graph.Nodes()) {
+    if (node->IsOp()) {
+      op_types.emplace(node->Name());
+    }
+  }
+  return op_types;
+}
+
+// Get inputs info
+std::unordered_map<std::string, std::vector<int64_t>> GetInputsInfo(
+    const std::string& key, const Graph& graph) {
+  std::unordered_set<std::string> inputs;
+  for (auto& node : graph.Nodes()) {
+    if (node->IsOp() && node->Name() == kCinnLaunchOp) {
+      if (BOOST_GET_CONST(std::string, node->Op()->GetAttr(kCompilationKey)) !=
+          key) {
+        continue;
+      }
+      for (auto in_var_name : node->Op()->InputArgumentNames()) {
+        VLOG(4) << "get an input name: " << in_var_name;
+        inputs.emplace(in_var_name);
+      }
+    }
+  }
+
+  std::unordered_map<std::string, std::vector<int64_t>> inputs_info;
+  for (auto& node : graph.Nodes()) {
+    if (node->IsVar() && inputs.count(node->Name())) {
+      VLOG(4) << node->Name() << " : " << node->Var()->GetShape();
+      inputs_info.emplace(node->Name(), node->Var()->GetShape());
+    }
+  }
+  return inputs_info;
+}
+
 //  X -
 //     | -> mul -> MUL_OUT -
 //  Y -                     | -> elementwise_add -> ADD_OUT -> relu -> RELU_OUT
@@ -65,6 +136,9 @@ std::unique_ptr<Graph> CreateGraph() {
 
   auto* mul_out = global_block->Var("MUL_OUT");
   mul_out->SetType(proto::VarType::LOD_TENSOR);
+  mul_out->SetLoDLevel(0);
+  mul_out->SetDataType(proto::VarType::FP32);
+  mul_out->SetShape({1000, 100});
   mul_op->SetOutput("Out", {mul_out->Name()});
 
   // add
@@ -83,6 +157,9 @@ std::unique_ptr<Graph> CreateGraph() {
 
   auto* add_out = global_block->Var("ADD_OUT");
   add_out->SetType(proto::VarType::LOD_TENSOR);
+  add_out->SetLoDLevel(0);
+  add_out->SetDataType(proto::VarType::FP32);
+  add_out->SetShape({1000, 100});
   add_op->SetOutput("Out", {add_out->Name()});
 
   // relu
@@ -92,11 +169,59 @@ std::unique_ptr<Graph> CreateGraph() {
 
   auto* relu_out = global_block->Var("RELU_OUT");
   relu_out->SetType(proto::VarType::LOD_TENSOR);
+  relu_out->SetLoDLevel(0);
+  relu_out->SetDataType(proto::VarType::FP32);
+  relu_out->SetShape({1000, 100});
   relu_op->SetOutput("Out", {relu_out->Name()});
   program.Flush();
   return std::make_unique<Graph>(program);
 }
 
+}  // namespace
+
+TEST(CinnCompilerTest, FlagController) {
+  // init
+  auto* cinn_compiler = CinnCompiler::GetInstance();
+  auto cinn_pass = ir::PassRegistry::Instance().Get("build_cinn_pass");
+  // apply build_cinn_pass & FLAGS_allow_cinn_ops="add"
+  {
+    FLAGS_allow_cinn_ops = "add";
+    auto graph = CreateGraph();
+    cinn_compiler->Clear();
+    cinn_pass->Apply(graph.get());
+    auto compilation_keys = GetCompilationKeys(*graph);
+    ASSERT_EQ(compilation_keys.size(), 0);
+  }
+  // apply build_cinn_pass & FLAGS_allow_cinn_ops="mul;relu"
+  {
+    FLAGS_allow_cinn_ops = "mul;relu";
+    auto graph = CreateGraph();
+    cinn_compiler->Clear();
+    cinn_pass->Apply(graph.get());
+    auto compilation_keys = GetCompilationKeys(*graph);
+    ASSERT_EQ(compilation_keys.size(), 2);
+  }
+  // apply build_cinn_pass & FLAGS_allow_cinn_ops="" &
+  // FLAGS_deny_cinn_ops="relu"
+  {
+    FLAGS_allow_cinn_ops = "";
+    FLAGS_deny_cinn_ops = "elementwise_add;relu";
+    auto graph = CreateGraph();
+    cinn_compiler->Clear();
+    cinn_pass->Apply(graph.get());
+    auto compilation_keys = GetCompilationKeys(*graph);
+    ASSERT_EQ(compilation_keys.size(), 1);
+    const auto& compiling_graph = cinn_compiler->FindGraph(compilation_keys[0]);
+    auto op_types = ExtractOpTypes(compiling_graph);
+    ASSERT_EQ(op_types.size(), 2);
+    ASSERT_EQ(op_types.count("feed"), 1);
+    ASSERT_EQ(op_types.count("mul"), 1);
+  }
+  // recover flags
+  FLAGS_allow_cinn_ops = "";
+  FLAGS_deny_cinn_ops = "";
+}
+
 TEST(CinnCompilerTest, Compile) {
   auto viz_pass = ir::PassRegistry::Instance().Get("graph_viz_pass");
   auto cinn_pass = ir::PassRegistry::Instance().Get("build_cinn_pass");
@@ -113,32 +238,31 @@ TEST(CinnCompilerTest, Compile) {
   cinn_pass->Apply(graph.get());
   viz_graph("processed_graph.dot", graph.get());
   // get the compilation_key
-  std::vector<std::string> compilation_keys;
-  for (auto& node : graph->Nodes()) {
-    if (node->IsOp() && node->Name() == kCinnLaunchOp) {
-      compilation_keys.emplace_back(
-          BOOST_GET_CONST(std::string, node->Op()->GetAttr(kCompilationKey)));
-    }
-  }
+  auto compilation_keys = GetCompilationKeys(*graph);
   ASSERT_EQ(compilation_keys.size(), 1);
 
   const auto& compilation_key = compilation_keys[0];
   auto* cinn_compiler = CinnCompiler::GetInstance();
+  VLOG(4) << "The graph to be compiled:\n"
+          << cinn_compiler->VizGraph(compilation_key);
   const auto& compiling_graph = cinn_compiler->FindGraph(compilation_key);
-  // viz_graph("compiling_graph.dot", const_cast<Graph*>(&compiling_graph));
+  viz_graph("compiling_graph.dot", const_cast<Graph*>(&compiling_graph));
 
   EXPECT_THROW(cinn_compiler->FindGraph("no_existed"),
                paddle::platform::EnforceNotMet);
 
-  LoDTensor tensor1, tensor2, tensor3;
-  tensor1.Resize({1000, 784});
-  tensor2.Resize({784, 100});
-  tensor3.Resize({100});
-  tensor1.mutable_data<float>(platform::CPUPlace());
-  tensor2.mutable_data<float>(platform::CPUPlace());
-  tensor3.mutable_data<float>(platform::CPUPlace());
-  std::map<std::string, const LoDTensor*> input_tensors = {
-      {"X", &tensor1}, {"Y", &tensor2}, {"Z", &tensor3}};
+  auto inputs_info = GetInputsInfo(compilation_key, *graph);
+  std::unordered_map<std::string, LoDTensor> create_inputs;
+  for (const auto& pair : inputs_info) {
+    auto& tensor = create_inputs[pair.first];
+    tensor.Resize(make_ddim(pair.second));
+    tensor.mutable_data<float>(platform::CPUPlace());
+  }
+  std::map<std::string, const LoDTensor*> input_tensors;
+  std::for_each(create_inputs.begin(), create_inputs.end(),
+                [&input_tensors](const auto& val) {
+                  input_tensors.emplace(val.first, &val.second);
+                });
 
   auto compile_fn = [&](const Target& target) {
     const auto& compiled_obj =
diff --git a/paddle/fluid/framework/paddle2cinn/cinn_graph_symbolization.cc b/paddle/fluid/framework/paddle2cinn/cinn_graph_symbolization.cc
index e4e16498b8440..941e82cef1bcc 100644
--- a/paddle/fluid/framework/paddle2cinn/cinn_graph_symbolization.cc
+++ b/paddle/fluid/framework/paddle2cinn/cinn_graph_symbolization.cc
@@ -15,16 +15,18 @@ limitations under the License. */
 #include "paddle/fluid/framework/paddle2cinn/cinn_graph_symbolization.h"
 
 #include <algorithm>
-#include <iterator>
 #include <queue>
+#include <unordered_map>
+#include <unordered_set>
 #include <vector>
 
-#include "paddle/fluid/framework/ir/graph_helper.h"
 #include "paddle/fluid/framework/paddle2cinn/transform_desc.h"
 #include "paddle/fluid/framework/variable.h"
 
 #include "cinn/frontend/op_mappers/use_op_mappers.h"
 #include "cinn/frontend/var_type_utils.h"
+#include "paddle/fluid/platform/enforce.h"
+#include "paddle/fluid/platform/errors.h"
 
 namespace paddle {
 namespace framework {
@@ -57,8 +59,21 @@ FeedInfoMap CinnGraphSymbolization::GetFeedInfoMapFromInput() const {
   for (auto& feed_pair : input_tensors_) {
     const auto& feed_name = feed_pair.first;
     const auto* tensor = feed_pair.second;
+    PADDLE_ENFORCE_NE(tensor, nullptr,
+                      platform::errors::PreconditionNotMet(
+                          "The input variable %s's tensor cannot be NULL,"
+                          "we need the variable's dtype and shape from tensor.",
+                          feed_name.c_str()));
 
+    VLOG(4) << "Get feed info from input: " << feed_name;
     feed_map[feed_name] = utils::GetCinnFeedInfoFromTensor(*tensor);
+
+    PADDLE_ENFORCE_NE(
+        feed_map[feed_name].shape.size(), 0UL,
+        platform::errors::PreconditionNotMet(
+            "The input variable %s's tensor shape cannot be empty,"
+            "we need the variable's dtype and shape from tensor.",
+            feed_name.c_str()));
   }
   return feed_map;
 }
@@ -86,35 +101,99 @@ CinnGraphSymbolization::GetGraphInputParameterNames() const {
 // Transform paddle scope to cinn, note that we only preserve the graph’s
 // input parameter variable and ignore others.
 std::shared_ptr<::cinn::hlir::framework::Scope>
-CinnGraphSymbolization::CreateCinnScope(const FeedInfoMap& feed_map) const {
+CinnGraphSymbolization::CreateCinnScope(const FeedInfoMap& feed_map) {
   auto cinn_scope = ::cinn::hlir::framework::Scope::Create();
 
   // get the graph's input parameter variable name list
   auto parameter_names = GetGraphInputParameterNames();
 
   for (const auto& param_name : parameter_names) {
-    VLOG(4) << "add param var [" << param_name << "] info scope";
+    PADDLE_ENFORCE_GT(
+        feed_map.count(param_name), 0UL,
+        platform::errors::NotFound("Cannot find parameter %s from input list,"
+                                   "please add the tensor into input.",
+                                   param_name.c_str()));
+
     // if cannot find var in graph input, skip.
     // scope accepte the CINN format name, so here we need transform
     // paddle format name to CINN format.
-    auto* cinn_var = cinn_scope->Var<CinnTensor>(
-        ::cinn::utils::TransValidVarName(param_name));
+    auto valid_name = ::cinn::utils::TransValidVarName(param_name);
+    auto* cinn_var = cinn_scope->Var<CinnTensor>(valid_name);
 
     auto& cinn_tensor = absl::get<CinnTensor>(*cinn_var);
     // here we only need preserve dtype and shape, do not need preserve data
     auto feed_info = feed_map.at(param_name);
     cinn_tensor->set_type(feed_info.type);
     cinn_tensor->Resize(::cinn::hlir::framework::Shape(feed_info.shape));
+    VLOG(4) << "add paddle param var [" << param_name
+            << "] info cinn scope var[" << valid_name << "]";
+    var_model_to_program_map_[param_name] = valid_name;
   }
 
   return cinn_scope;
 }
 
+std::vector<Node*> CinnGraphSymbolization::TopologicalSort() const {
+  std::unordered_set<Node*> op_nodes;
+  std::for_each(graph_.Nodes().begin(), graph_.Nodes().end(),
+                [&op_nodes](Node* n) {
+                  if (n->IsOp()) {
+                    op_nodes.emplace(n);
+                  }
+                });
+
+  std::unordered_map<Node*, std::unordered_map<Node*, size_t>> adj_list;
+  std::unordered_map<Node*, size_t> in_degrees;
+  for (auto* n : op_nodes) {
+    // the op's input is var
+    for (auto* in_var : n->inputs) {
+      // the var's input is op
+      for (auto* in_op : in_var->inputs) {
+        if (op_nodes.count(in_op)) {
+          ++adj_list[in_op][n];
+          ++in_degrees[n];
+        }
+      }
+    }
+  }
+
+  // find topology entries
+  std::queue<Node*> queue;
+  for (auto* n : op_nodes) {
+    if (!in_degrees[n]) {
+      queue.push(n);
+    }
+  }
+
+  // topological sorting
+  std::vector<Node*> sorted_ops;
+  while (!queue.empty()) {
+    auto* cur_op = queue.front();
+    queue.pop();
+
+    VLOG(4) << "topological sort insert: " << cur_op->Name() << " "
+            << reinterpret_cast<void*>(cur_op) << " input "
+            << cur_op->inputs.size();
+    sorted_ops.emplace_back(cur_op);
+    for (const auto& adj_pair : adj_list[cur_op]) {
+      in_degrees.at(adj_pair.first) -= adj_pair.second;
+      if (!in_degrees[adj_pair.first]) {
+        queue.push(adj_pair.first);
+      }
+    }
+  }
+
+  PADDLE_ENFORCE_EQ(sorted_ops.size(), op_nodes.size(),
+                    platform::errors::PreconditionNotMet(
+                        "The sorting graph contains cycles."));
+  return sorted_ops;
+}
+
 std::vector<std::unique_ptr<CinnOpDesc>>
 CinnGraphSymbolization::TransformAllGraphOpToCinn() const {
   std::vector<std::unique_ptr<CinnOpDesc>> cinn_op_descs;
 
-  const auto& sorted_ops = ir::TopologySortOperations(graph_);
+  auto sorted_ops = TopologicalSort();
   for (auto* node : sorted_ops) {
     cinn_op_descs.emplace_back(std::make_unique<CinnOpDesc>());
     auto& cinn_desc = cinn_op_descs.back();
diff --git a/paddle/fluid/framework/paddle2cinn/cinn_graph_symbolization.h b/paddle/fluid/framework/paddle2cinn/cinn_graph_symbolization.h
index b6b4b24c6ee3d..af60493044cf3 100644
--- a/paddle/fluid/framework/paddle2cinn/cinn_graph_symbolization.h
+++ b/paddle/fluid/framework/paddle2cinn/cinn_graph_symbolization.h
@@ -102,6 +102,9 @@ class CinnGraphSymbolization {
   // transform all paddle var desc in feed list into cinn_var_descs_
   FeedInfoMap GetFeedInfoMapFromInput() const;
 
+  // get the topological sort of the graph_
+  std::vector<ir::Node*> TopologicalSort() const;
+
   // transform all paddle op desc in graph into cinn op desc
   std::vector<std::unique_ptr<CinnOpDesc>> TransformAllGraphOpToCinn() const;
 
@@ -115,7 +118,7 @@ class CinnGraphSymbolization {
 
   // create cinn scope and add parameter's feed info into scope
   std::shared_ptr<::cinn::hlir::framework::Scope> CreateCinnScope(
-      const FeedInfoMap& feed_map) const;
+      const FeedInfoMap& feed_map);
 
   // get the graph op's input persistable var name set
   std::unordered_set<std::string> GetGraphInputParameterNames() const;
diff --git a/paddle/fluid/framework/paddle2cinn/cinn_graph_symbolization_test.cc b/paddle/fluid/framework/paddle2cinn/cinn_graph_symbolization_test.cc
index 940228314a1d4..be2ca2f73e186 100644
--- a/paddle/fluid/framework/paddle2cinn/cinn_graph_symbolization_test.cc
+++ b/paddle/fluid/framework/paddle2cinn/cinn_graph_symbolization_test.cc
@@ -268,7 +268,7 @@ TEST_F(CinnGraphSymbolizationTest, sortgraph) {
     sort_names.emplace_back(desc->Type());
   }
   ASSERT_EQ(sort_names,
-            std::vector<std::string>({"feed", "mul", "feed", "add", "relu"}));
+            std::vector<std::string>({"feed", "feed", "mul", "add", "relu"}));
 }
 
 TEST_F(CinnGraphSymbolizationTest, runop) {
diff --git a/paddle/fluid/framework/pass_desc.proto b/paddle/fluid/framework/pass_desc.proto
index c95e40a1d25e8..86a1effb2896e 100644
--- a/paddle/fluid/framework/pass_desc.proto
+++ b/paddle/fluid/framework/pass_desc.proto
@@ -16,20 +16,68 @@ package paddle.framework.proto;
 
 // Describes one subsitute subgraph.
 message PassDesc {
+  enum RoleType {
+    kVariable = 0;
+    kOperator = 1;
+  }
+  enum OperationType {
+    kAdd = 0;
+    kSub = 1;
+    kMul = 2;
+    kDiv = 3;
+    kSize = 4;
+  }
+  enum ConditionType {
+    kEQ = 0;
+    kNE = 1;
+    kGT = 2;
+    kGE = 3;
+    kLT = 4;
+    kLE = 5;
+  }
+  // Representation of attr in var or operator.
+  message Attr {
+    required RoleType role = 1;
+    optional string var_name = 2;
+    optional int32 op_index = 3;
+    required string name = 4;
+    optional string element_name = 5;
+    optional int32 element_index = 6;
+    optional OperationType operation = 7;
+  }
+  // The operation to be performed.
+  message Operation {
+    required OperationType type = 1;
+    optional Attr attr = 2;
+    optional OpDesc.Attr value = 3;
+  }
   message VarMap {
     required string pattern_var = 1;
     required string replace_var = 2;
   }
   message AttrMap {
-    required int32 pattern_op_idx = 1;
-    required int32 replace_op_idx = 2;
-    required string pattern_name = 3;
-    required string replace_name = 4;
+    required Attr pattern_attr = 1;
+    required Attr replace_attr = 2;
+    optional Operation operation = 3;
+  }
+  message AttrCondition {
+    required Attr attr = 1;
+    required ConditionType type = 2;
+    optional Attr condition_attr = 3;
+    optional OpDesc.Attr condition_value = 4;
+    optional Operation operation = 5;
   }
-  required ProgramDesc pattern = 1;
-  required ProgramDesc replace = 2;
+  // A pair of subgraphs for matching and rewriting.
+  repeated OpDesc pattern = 1;
+  repeated OpDesc replace = 2;
+  // Mapping vars between pattern and replace subgraphs.
   repeated VarMap var_maps = 3;
-  repeated AttrMap attr_maps = 4;
+  // Mapping attrs of vars and ops between pattern and replace subgraphs.
+  repeated AttrMap var_attr_maps = 4;
+  repeated AttrMap op_attr_maps = 5;
+  // Limit the attrs of vars and ops in pattern subgraph.
+  repeated AttrCondition var_attr_conditions = 6;
+  repeated AttrCondition op_attr_conditions = 7;
 }
 
 // A series of PassDesc.
diff --git a/paddle/fluid/framework/pten_utils.cc b/paddle/fluid/framework/pten_utils.cc
new file mode 100644
index 0000000000000..b423d0e05e174
--- /dev/null
+++ b/paddle/fluid/framework/pten_utils.cc
@@ -0,0 +1,208 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <sstream>
+
+#include "paddle/fluid/framework/pten_utils.h"
+#include "paddle/pten/core/kernel_factory.h"
+
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/op_info.h"
+#include "paddle/fluid/framework/selected_rows.h"
+#include "paddle/fluid/framework/variable.h"
+#include "paddle/fluid/string/string_helper.h"
+
+namespace paddle {
+namespace framework {
+
+class KernelArgsNameMakerByOpProto : public KernelArgsNameMaker {
+ public:
+  explicit KernelArgsNameMakerByOpProto(
+      const framework::proto::OpProto* op_proto)
+      : op_proto_(op_proto) {
+    PADDLE_ENFORCE_NOT_NULL(op_proto_, platform::errors::InvalidArgument(
+                                           "Op proto cannot be nullptr."));
+  }
+
+  ~KernelArgsNameMakerByOpProto() {}
+
+  const paddle::SmallVector<std::string>& GetInputArgsNames() override;
+  const paddle::SmallVector<std::string>& GetOutputArgsNames() override;
+  const paddle::SmallVector<std::string>& GetAttrsArgsNames() override;
+
+  KernelSignature GetKernelSignature();
+
+ private:
+  DISABLE_COPY_AND_ASSIGN(KernelArgsNameMakerByOpProto);
+
+ private:
+  const framework::proto::OpProto* op_proto_;
+
+  paddle::SmallVector<std::string> input_names_;
+  paddle::SmallVector<std::string> output_names_;
+  paddle::SmallVector<std::string> attr_names_;
+};
+
+OpKernelType TransPtenKernelKeyToOpKernelType(
+    const pten::KernelKey& kernel_key) {
+  proto::VarType::Type data_type =
+      pten::TransToProtoVarType(kernel_key.dtype());
+  platform::Place place = pten::TransToFluidPlace(kernel_key.backend());
+  DataLayout data_layout = pten::TransToFluidDataLayout(kernel_key.layout());
+  LibraryType library_type = LibraryType::kPlain;
+  if (kernel_key.backend() == pten::Backend::MKLDNN) {
+    library_type = LibraryType::kMKLDNN;
+  } else if (kernel_key.backend() == pten::Backend::CUDNN) {
+    library_type = LibraryType::kCUDNN;
+  } else {
+    // do nothing
+  }
+  // TODO(chenweihang): the customized_type_value is lost
+  return OpKernelType(data_type, place, data_layout, library_type);
+}
+
+pten::KernelKey TransOpKernelTypeToPtenKernelKey(
+    const OpKernelType& kernel_type) {
+  pten::Backend backend = pten::TransToPtenBackend(kernel_type.place_);
+  if (kernel_type.library_type_ == LibraryType::kMKLDNN) {
+    backend = pten::Backend::MKLDNN;
+  } else if (kernel_type.library_type_ == LibraryType::kCUDNN) {
+    backend = pten::Backend::CUDNN;
+  } else {
+    // do
+  }
+  paddle::experimental::DataLayout layout =
+      pten::TransToPtenDataLayout(kernel_type.data_layout_);
+  paddle::experimental::DataType dtype =
+      pten::TransToPtenDataType(kernel_type.data_type_);
+  return pten::KernelKey(backend, layout, dtype);
+}
+
+KernelSignatureMap* KernelSignatureMap::kernel_signature_map_ = nullptr;
+std::once_flag KernelSignatureMap::init_flag_;
+
+KernelSignatureMap& KernelSignatureMap::Instance() {
+  std::call_once(init_flag_, [] {
+    kernel_signature_map_ = new KernelSignatureMap();
+    for (const auto& pair : OpInfoMap::Instance().map()) {
+      const auto& op_type = pair.first;
+      const auto* op_proto = pair.second.proto_;
+      if (pten::KernelFactory::Instance().HasCompatiblePtenKernel(op_type)) {
+        KernelArgsNameMakerByOpProto maker(op_proto);
+        VLOG(10) << "Register kernel signature for " << op_type;
+        auto success =
+            kernel_signature_map_->map_
+                .emplace(op_type, std::move(maker.GetKernelSignature()))
+                .second;
+        PADDLE_ENFORCE_EQ(
+            success, true,
+            platform::errors::PermissionDenied(
+                "Kernel signature of the operator %s has been registered.",
+                op_type));
+      }
+    }
+  });
+  return *kernel_signature_map_;
+}
+
+bool KernelSignatureMap::Has(const std::string& op_type) const {
+  return map_.find(op_type) != map_.end();
+}
+
+const KernelSignature& KernelSignatureMap::Get(
+    const std::string& op_type) const {
+  auto it = map_.find(op_type);
+  PADDLE_ENFORCE_NE(
+      it, map_.end(),
+      platform::errors::NotFound(
+          "Operator `%s`'s kernel signature is not registered.", op_type));
+  return it->second;
+}
+
+const paddle::SmallVector<std::string>&
+KernelArgsNameMakerByOpProto::GetInputArgsNames() {
+  for (int i = 0; i < op_proto_->inputs_size(); ++i) {
+    auto& in = op_proto_->inputs()[i];
+    auto& in_name = in.name();
+    if ((in.has_extra() && in.extra()) || (in.has_quant() && in.quant())) {
+      VLOG(1) << "Parse PtenKernel input: skip extra & quant input - "
+              << in_name;
+      continue;
+    }
+    // If contains dispensable input, we should override the
+    // GetExpectedPtenKernelArgs method self
+    if (in.has_dispensable() && in.dispensable()) {
+      VLOG(1) << "Parse PtenKernel input: skip dispensable input - " << in_name;
+      continue;
+    }
+    VLOG(1) << "Parse PtenKernel input: " << in_name;
+    input_names_.emplace_back(in_name);
+  }
+  return input_names_;
+}
+
+const paddle::SmallVector<std::string>&
+KernelArgsNameMakerByOpProto::GetOutputArgsNames() {
+  for (int i = 0; i < op_proto_->outputs_size(); ++i) {
+    auto& out = op_proto_->outputs()[i];
+    auto& out_name = out.name();
+    // TODO(chenweihang): outputs also need skip some cases
+    VLOG(1) << "Parse PtenKernel output: " << out_name;
+    output_names_.emplace_back(out_name);
+  }
+  return output_names_;
+}
+
+const paddle::SmallVector<std::string>&
+KernelArgsNameMakerByOpProto::GetAttrsArgsNames() {
+  for (int i = 0; i < op_proto_->attrs_size(); ++i) {
+    auto& attr = op_proto_->attrs()[i];
+    auto& attr_name = attr.name();
+    if (attr_name == "use_mkldnn" || attr_name == "op_role" ||
+        attr_name == "op_role_var" || attr_name == "op_namescope" ||
+        attr_name == "op_callstack" || attr_name == "op_device") {
+      VLOG(1) << "Parse PtenKernel attribute: skip needless attr - "
+              << attr_name;
+      continue;
+    }
+    if ((attr.has_extra() && attr.extra()) ||
+        (attr.has_quant() && attr.quant())) {
+      VLOG(1) << "Parse PtenKernel attribute: skip extra & quant attr - "
+              << attr_name;
+      continue;
+    }
+    VLOG(1) << "Parse PtenKernel attribute: " << attr_name;
+    attr_names_.emplace_back(attr_name);
+  }
+
+  return attr_names_;
+}
+
+KernelSignature KernelArgsNameMakerByOpProto::GetKernelSignature() {
+  return KernelSignature(op_proto_->type(), GetInputArgsNames(),
+                         GetAttrsArgsNames(), GetOutputArgsNames());
+}
+
+std::string KernelSignatureToString(const KernelSignature& signature) {
+  std::stringstream os;
+  os << "Kernel Signature - name: " << signature.name
+     << "; inputs: " << string::join_strings(std::get<0>(signature.args), ", ")
+     << "; attributes: "
+     << string::join_strings(std::get<1>(signature.args), ", ") << "; outputs: "
+     << string::join_strings(std::get<2>(signature.args), ", ");
+  return os.str();
+}
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/pten_utils.h b/paddle/fluid/framework/pten_utils.h
new file mode 100644
index 0000000000000..fd893e04d3ca4
--- /dev/null
+++ b/paddle/fluid/framework/pten_utils.h
@@ -0,0 +1,94 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <mutex>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "paddle/fluid/framework/framework.pb.h"
+#include "paddle/fluid/framework/op_kernel_type.h"
+#include "paddle/fluid/framework/tensor.h"
+#include "paddle/fluid/imperative/type_defs.h"
+#include "paddle/fluid/platform/macros.h"
+#include "paddle/fluid/platform/place.h"
+#include "paddle/pten/api/lib/utils/tensor_utils.h"
+#include "paddle/pten/include/core.h"
+#include "paddle/utils/flat_hash_map.h"
+#include "paddle/utils/small_vector.h"
+
+namespace paddle {
+namespace framework {
+
+/* Kernel Key translate */
+
+OpKernelType TransPtenKernelKeyToOpKernelType(
+    const pten::KernelKey& kernel_key);
+pten::KernelKey TransOpKernelTypeToPtenKernelKey(
+    const OpKernelType& kernel_type);
+
+/* Kernel Args parse */
+
+struct KernelSignature {
+  std::string name;
+  KernelArgsTuple args;
+
+  KernelSignature() = default;
+  KernelSignature(std::string&& kernel_name,
+                  paddle::SmallVector<std::string>&& inputs,
+                  paddle::SmallVector<std::string>&& attrs,
+                  paddle::SmallVector<std::string>&& outputs)
+      : name(std::move(kernel_name)),
+        args(std::make_tuple(inputs, attrs, outputs)) {}
+  KernelSignature(const std::string& kernel_name,
+                  const paddle::SmallVector<std::string>& inputs,
+                  const paddle::SmallVector<std::string>& attrs,
+                  const paddle::SmallVector<std::string>& outputs)
+      : name(kernel_name), args(std::make_tuple(inputs, attrs, outputs)) {}
+};
+
+// TODO(chenweihang): we can generate this map by proto info in compile time
+class KernelSignatureMap {
+ public:
+  static KernelSignatureMap& Instance();
+
+  bool Has(const std::string& op_type) const;
+
+  const KernelSignature& Get(const std::string& op_type) const;
+
+ private:
+  KernelSignatureMap() = default;
+  DISABLE_COPY_AND_ASSIGN(KernelSignatureMap);
+
+ private:
+  static KernelSignatureMap* kernel_signature_map_;
+  static std::once_flag init_flag_;
+
+  paddle::flat_hash_map<std::string, KernelSignature> map_;
+};
+
+class KernelArgsNameMaker {
+ public:
+  virtual ~KernelArgsNameMaker() {}
+  virtual const paddle::SmallVector<std::string>& GetInputArgsNames() = 0;
+  virtual const paddle::SmallVector<std::string>& GetOutputArgsNames() = 0;
+  virtual const paddle::SmallVector<std::string>& GetAttrsArgsNames() = 0;
+};
+
+std::string KernelSignatureToString(const KernelSignature& signature);
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/pten_utils_test.cc b/paddle/fluid/framework/pten_utils_test.cc
new file mode 100644
index 0000000000000..ab2d60a34303a
--- /dev/null
+++ b/paddle/fluid/framework/pten_utils_test.cc
@@ -0,0 +1,55 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/framework/pten_utils.h"
+#include "gtest/gtest.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/selected_rows.h"
+#include "paddle/fluid/framework/variable.h"
+
+TEST(PtenUtils, TransPtenKernelKeyToOpKernelType) {
+  pten::KernelKey kernel_key(pten::Backend::CPU, pten::DataLayout::NCHW,
+                             pten::DataType::FLOAT32);
+  auto op_kernel_type =
+      paddle::framework::TransPtenKernelKeyToOpKernelType(kernel_key);
+  ASSERT_EQ(op_kernel_type.data_type_, paddle::framework::proto::VarType::FP32);
+  ASSERT_EQ(op_kernel_type.data_layout_, paddle::framework::DataLayout::kNCHW);
+  ASSERT_TRUE(paddle::platform::is_cpu_place(op_kernel_type.place_));
+  ASSERT_EQ(op_kernel_type.library_type_,
+            paddle::framework::LibraryType::kPlain);
+
+#ifdef PADDLE_WITH_MKLDNN
+  pten::KernelKey kernel_key_mkldnn(
+      pten::Backend::MKLDNN, pten::DataLayout::NCHW, pten::DataType::FLOAT32);
+  op_kernel_type =
+      paddle::framework::TransPtenKernelKeyToOpKernelType(kernel_key_mkldnn);
+  ASSERT_EQ(op_kernel_type.data_type_, paddle::framework::proto::VarType::FP32);
+  ASSERT_EQ(op_kernel_type.data_layout_, paddle::framework::DataLayout::kNCHW);
+  ASSERT_TRUE(paddle::platform::is_cpu_place(op_kernel_type.place_));
+  ASSERT_EQ(op_kernel_type.library_type_,
+            paddle::framework::LibraryType::kMKLDNN);
+#endif
+
+#ifdef PADDLE_WITH_CUDA
+  pten::KernelKey kernel_key_cudnn(pten::Backend::CUDNN, pten::DataLayout::NCHW,
+                                   pten::DataType::FLOAT32);
+  op_kernel_type =
+      paddle::framework::TransPtenKernelKeyToOpKernelType(kernel_key_cudnn);
+  ASSERT_EQ(op_kernel_type.data_type_, paddle::framework::proto::VarType::FP32);
+  ASSERT_EQ(op_kernel_type.data_layout_, paddle::framework::DataLayout::kNCHW);
+  ASSERT_TRUE(paddle::platform::is_gpu_place(op_kernel_type.place_));
+  ASSERT_EQ(op_kernel_type.library_type_,
+            paddle::framework::LibraryType::kCUDNN);
+#endif
+}
diff --git a/paddle/fluid/framework/scope.h b/paddle/fluid/framework/scope.h
index bab57e529df08..ab29a7a88fc00 100644
--- a/paddle/fluid/framework/scope.h
+++ b/paddle/fluid/framework/scope.h
@@ -39,6 +39,16 @@ class Variable;
 namespace paddle {
 namespace framework {
 
+// TODO(zhiqiu): add more function in base class
+class ScopeBase {
+ public:
+  /// Find a variable in the scope or any of its ancestors.  Returns
+  /// nullptr if cannot find.
+  /// Caller doesn't own the returned Variable.
+  virtual Variable* FindVar(const std::string& name) const = 0;
+  virtual ~ScopeBase() {}
+};
+
 class Scope;
 
 /**
@@ -49,7 +59,7 @@ class Scope;
  * One net can run in different scopes and update different variable in the
  * scope.
  */
-class Scope {
+class Scope : public ScopeBase {
  public:
   Scope() {}
   ~Scope();
diff --git a/paddle/fluid/framework/type_defs.h b/paddle/fluid/framework/type_defs.h
index 951daea47bde3..7f7785b374ead 100644
--- a/paddle/fluid/framework/type_defs.h
+++ b/paddle/fluid/framework/type_defs.h
@@ -17,11 +17,13 @@ limitations under the License. */
 #include <map>
 #include <memory>
 #include <string>
+#include <tuple>
 #include <unordered_map>
 #include <unordered_set>
 #include <vector>
 #include "paddle/fluid/imperative/type_defs.h"
 #include "paddle/fluid/platform/variant.h"
+#include "paddle/utils/small_vector.h"
 
 namespace paddle {
 namespace framework {
@@ -33,8 +35,8 @@ class BlockDesc;
 class Variable;
 class InferNoNeedBufferVarsFN;
 
-using VariableNameMap = std::map<std::string, std::vector<std::string>>;
 // TODO(panyx0718): Replace vector with something like gtl::Vector.
+using VariableNameMap = std::map<std::string, std::vector<std::string>>;
 using VariableValueMap = std::map<std::string, std::vector<Variable*>>;
 
 // The order should be as same as framework.proto
@@ -82,5 +84,10 @@ using InferShapeFN = std::function<void(InferShapeContext*)>;
 using InplacePair = std::unordered_map<std::string, std::string>;
 using InferInplaceOpFN = std::function<InplacePair(bool /*use_cuda*/)>;
 
+// tuple(input_names, attr_names, output_names)
+using KernelArgsTuple = std::tuple<paddle::SmallVector<std::string>,
+                                   paddle::SmallVector<std::string>,
+                                   paddle::SmallVector<std::string>>;
+
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/var_type_traits.h b/paddle/fluid/framework/var_type_traits.h
index c8c3cf364e0fc..f4c41197a9dfa 100644
--- a/paddle/fluid/framework/var_type_traits.h
+++ b/paddle/fluid/framework/var_type_traits.h
@@ -69,6 +69,7 @@ class BKCLCommunicator;
 
 namespace framework {
 class LoDRankTable;
+class ScopeBase;
 class LoDTensor;
 class ReaderHolder;
 class Scope;
diff --git a/paddle/fluid/imperative/CMakeLists.txt b/paddle/fluid/imperative/CMakeLists.txt
index cb744fb2b6aa2..8f196636af489 100644
--- a/paddle/fluid/imperative/CMakeLists.txt
+++ b/paddle/fluid/imperative/CMakeLists.txt
@@ -1,14 +1,13 @@
 cc_library(imperative_flag SRCS flags.cc DEPS gflags flags)
-
 IF(WITH_XPU)
-cc_library(prepared_operator SRCS prepared_operator.cc DEPS xpu_op_list proto_desc operator device_context lod_tensor selected_rows var_type_traits op_kernel_type data_transform nan_inf_utils)
+cc_library(prepared_operator SRCS prepared_operator.cc DEPS xpu_op_list proto_desc operator device_context lod_tensor selected_rows var_type_traits op_kernel_type data_transform nan_inf_utils pten pten_utils)
 ELSE()
-cc_library(prepared_operator SRCS prepared_operator.cc DEPS proto_desc operator device_context lod_tensor selected_rows var_type_traits op_kernel_type data_transform nan_inf_utils)
+cc_library(prepared_operator SRCS prepared_operator.cc DEPS proto_desc operator device_context lod_tensor selected_rows var_type_traits op_kernel_type data_transform nan_inf_utils pten pten_utils)
 ENDIF()
 cc_library(layer SRCS layer.cc DEPS prepared_operator math_function imperative_flag variable_helper op_registry)
 add_subdirectory(jit)
 cc_library(amp SRCS amp_auto_cast.cc DEPS layer )
-cc_library(tracer SRCS tracer.cc DEPS layer engine program_desc_tracer amp denormal)
+cc_library(tracer SRCS tracer.cc DEPS layer engine program_desc_tracer amp denormal garbage_collector)
 cc_library(basic_engine SRCS basic_engine.cc DEPS layer gradient_accumulator)
 cc_library(engine SRCS basic_engine.cc partial_grad_engine.cc DEPS layer gradient_accumulator)
 cc_library(imperative_profiler SRCS profiler.cc DEPS flags)
diff --git a/paddle/fluid/imperative/amp_auto_cast.cc b/paddle/fluid/imperative/amp_auto_cast.cc
index b0d86f6db9f96..f2ea692ad0880 100644
--- a/paddle/fluid/imperative/amp_auto_cast.cc
+++ b/paddle/fluid/imperative/amp_auto_cast.cc
@@ -191,6 +191,14 @@ NameVarBaseMap AutoCastInputs(const std::string& op_type,
         continue;
       }
 
+      if ((op_type == "fused_attention" || op_type == "fused_feedforward")) {
+        if (pair.first == "LnScale" || pair.first == "LnBias" ||
+            pair.first == "Ln2Scale" || pair.first == "Ln2Bias" ||
+            pair.first == "Ln1Scale" || pair.first == "Ln1Bias") {
+          continue;
+        }
+      }
+
       VLOG(5) << "Op(" << op_type << "): Cast " << pair.first << " from "
               << GetDtypeStr(*pair.second.cbegin()) << " to float16";
       for (auto& var : pair.second) {
@@ -223,6 +231,14 @@ NameVarBaseMap AutoCastInputs(const std::string& op_type,
           pair.first == "X" && dst_type == framework::proto::VarType::FP32) {
         continue;
       }
+      if ((op_type == "fused_attention" || op_type == "fused_feedforwad") &&
+          dst_type == framework::proto::VarType::FP32) {
+        if (pair.first != "LnScale" && pair.first != "LnBias" &&
+            pair.first != "Ln2Scale" && pair.first != "Ln2Bias" &&
+            pair.first != "Ln1Scale" && pair.first != "Ln1Bias") {
+          continue;
+        }
+      }
       VLOG(5) << "Op(" << op_type << "): Cast " << pair.first << " from "
               << GetDtypeStr(*pair.second.cbegin()) << " to "
               << framework::DataTypeToString(dst_type);
diff --git a/paddle/fluid/imperative/layer.cc b/paddle/fluid/imperative/layer.cc
index 53ae5b8127fdb..b584b928f96b9 100644
--- a/paddle/fluid/imperative/layer.cc
+++ b/paddle/fluid/imperative/layer.cc
@@ -356,6 +356,8 @@ void VarBase::BumpInplaceVersion() {
   MutableVar()->BumpInplaceVersion();
 }
 
+pten::KernelContext OpBase::pt_kernel_context_;
+
 void OpBase::SetType(const std::string& type) {
   op_ = framework::OpRegistry::CreateOp(type, {}, {}, {}, false);
 }
@@ -371,7 +373,8 @@ static void OpBaseRunImpl(const framework::OperatorBase& op,
                           const NameVarMap<VarType>& outs,
                           const framework::AttributeMap& attrs,
                           const framework::AttributeMap& default_attrs,
-                          const platform::Place& place) {
+                          const platform::Place& place,
+                          pten::KernelContext* pt_kernel_context) {
   auto* op_kernel = dynamic_cast<const framework::OperatorWithKernel*>(&op);
   PADDLE_ENFORCE_NOT_NULL(
       op_kernel, platform::errors::PermissionDenied(
@@ -412,8 +415,8 @@ static void OpBaseRunImpl(const framework::OperatorBase& op,
    * after the execution of op, but the original input is directly
    * overwritten in the previous dynamic graph implemention.
    */
-  auto prepared_op =
-      PreparedOp::Prepare(ins, outs, *op_kernel, place, attrs, default_attrs);
+  auto prepared_op = PreparedOp::Prepare(ins, outs, *op_kernel, place, attrs,
+                                         default_attrs, pt_kernel_context);
   auto tmp_ins_ptr =
       PrepareData<VarType>(*op_kernel, ins, prepared_op.kernel_type());
   if (tmp_ins_ptr == nullptr) {
@@ -441,7 +444,8 @@ void OpBase::Run(const framework::OperatorBase& op,
                  const framework::AttributeMap& attrs,
                  const framework::AttributeMap& default_attrs,
                  const platform::Place& place) {
-  OpBaseRunImpl<VarBase>(op, ins, outs, attrs, default_attrs, place);
+  OpBaseRunImpl<VarBase>(op, ins, outs, attrs, default_attrs, place,
+                         &pt_kernel_context_);
 }
 
 void OpBase::Run(const framework::OperatorBase& op,
@@ -450,7 +454,8 @@ void OpBase::Run(const framework::OperatorBase& op,
                  const framework::AttributeMap& attrs,
                  const framework::AttributeMap& default_attrs,
                  const platform::Place& place) {
-  OpBaseRunImpl<VariableWrapper>(op, ins, outs, attrs, default_attrs, place);
+  OpBaseRunImpl<VariableWrapper>(op, ins, outs, attrs, default_attrs, place,
+                                 &pt_kernel_context_);
 }
 
 void ClearNoNeedBufferInputs(OpBase* op) {
diff --git a/paddle/fluid/imperative/layer.h b/paddle/fluid/imperative/layer.h
index 16580627ed196..9108155a043b7 100644
--- a/paddle/fluid/imperative/layer.h
+++ b/paddle/fluid/imperative/layer.h
@@ -36,6 +36,7 @@
 #include "paddle/fluid/imperative/variable_wrapper.h"
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/macros.h"
+#include "paddle/pten/include/core.h"
 
 namespace paddle {
 namespace framework {
diff --git a/paddle/fluid/imperative/op_base.h b/paddle/fluid/imperative/op_base.h
index acb125a82925d..4122e2af3deda 100644
--- a/paddle/fluid/imperative/op_base.h
+++ b/paddle/fluid/imperative/op_base.h
@@ -25,6 +25,7 @@
 #include "paddle/fluid/imperative/type_defs.h"
 #include "paddle/fluid/imperative/variable_wrapper.h"
 #include "paddle/fluid/platform/place.h"
+#include "paddle/pten/include/core.h"
 
 namespace paddle {
 namespace imperative {
@@ -183,6 +184,8 @@ class OpBase {
                   const framework::AttributeMap& default_attrs,
                   const platform::Place& place);
 
+  static pten::KernelContext* GetKernelContext() { return &pt_kernel_context_; }
+
  private:
   static const std::string& UnknownOpType() {
     static std::string kUnknownOpType{"unknown"};
@@ -197,6 +200,9 @@ class OpBase {
   std::unique_ptr<framework::OperatorBase> op_;
   platform::Place place_;
   size_t id_{-1UL};
+  // In order to reduce the compatibility phase
+  // performance overhead, temporarily cache KernelContext
+  static pten::KernelContext pt_kernel_context_;
 };
 
 class GradOpNode {
diff --git a/paddle/fluid/imperative/prepared_operator.cc b/paddle/fluid/imperative/prepared_operator.cc
index c31464bf20acc..c9e211809a406 100644
--- a/paddle/fluid/imperative/prepared_operator.cc
+++ b/paddle/fluid/imperative/prepared_operator.cc
@@ -17,10 +17,14 @@
 #include "paddle/fluid/framework/data_type_transform.h"
 #include "paddle/fluid/framework/details/nan_inf_utils.h"
 #include "paddle/fluid/imperative/infer_shape_context.h"
+#include "paddle/fluid/imperative/tracer.h"
+#include "paddle/pten/common/scalar.h"
+#include "paddle/utils/small_vector.h"
 #ifdef PADDLE_WITH_XPU
 #include "paddle/fluid/platform/xpu/xpu_op_list.h"
 #endif
 DECLARE_bool(check_nan_inf);
+DECLARE_bool(run_pten_kernel);
 DECLARE_bool(benchmark);
 
 namespace paddle {
@@ -46,6 +50,21 @@ const framework::Tensor* GetTensorFromVar(const framework::Variable& var) {
   }
 }
 
+static const framework::Attribute& GetAttr(
+    const framework::AttributeMap& attrs,
+    const framework::AttributeMap& default_attrs, const std::string& name) {
+  auto it = attrs.find(name);
+  bool found = it != attrs.end();
+  if (!found) {
+    it = default_attrs.find(name);
+    found = it != default_attrs.end();
+  }
+  PADDLE_ENFORCE_EQ(
+      found, true,
+      platform::errors::NotFound("(%s) is not found in AttributeMap.", name));
+  return it->second;
+}
+
 template <typename VarType>
 static void HandleComplexGradToRealGrad(const NameVarMap<VarType>& outs) {
   for (auto& pair : outs) {
@@ -89,13 +108,31 @@ PreparedOp::PreparedOp(const framework::OperatorBase& op,
       func_(func),
       dev_ctx_(dev_ctx) {}
 
+PreparedOp::PreparedOp(const framework::OperatorBase& op,
+                       const framework::RuntimeContext& ctx,
+                       const framework::OpKernelType& kernel_type,
+                       const framework::KernelSignature& kernel_signature,
+                       const pten::Kernel& pt_kernel,
+                       pten::KernelContext* pt_kernel_context,
+                       platform::DeviceContext* dev_ctx)
+    : op_(op),
+      ctx_(ctx),
+      kernel_type_(kernel_type),
+      func_(nullptr),
+      dev_ctx_(dev_ctx),
+      run_pten_kernel_(true),
+      pt_kernel_signature_(kernel_signature),
+      pt_kernel_(pt_kernel),
+      pt_kernel_context_(pt_kernel_context) {}
+
 template <typename VarType>
 PreparedOp PrepareImpl(const NameVarMap<VarType>& ins,
                        const NameVarMap<VarType>& outs,
                        const framework::OperatorWithKernel& op,
                        const platform::Place& place,
                        const framework::AttributeMap& attrs,
-                       const framework::AttributeMap& default_attrs) {
+                       const framework::AttributeMap& default_attrs,
+                       pten::KernelContext* pt_kernel_context) {
   platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
   auto* dev_ctx = pool.Get(place);
 
@@ -115,11 +152,36 @@ PreparedOp PrepareImpl(const NameVarMap<VarType>& ins,
 #endif
 
   // 1. get expected kernel key
-  auto expected_kernel_key = op.GetExpectedKernelType(
-      DygraphExecutionContext<VarType>(op, framework::Scope(), *dev_ctx, ctx,
-                                       ins, outs, attrs, default_attrs));
+  auto dygraph_exe_ctx = DygraphExecutionContext<VarType>(
+      op, framework::Scope(), *dev_ctx, ctx, ins, outs, attrs, default_attrs);
+  auto expected_kernel_key = op.GetExpectedKernelType(dygraph_exe_ctx);
   VLOG(3) << "expected_kernel_key:" << expected_kernel_key;
 
+  if (FLAGS_run_pten_kernel &&
+      pten::KernelFactory::Instance().HasCompatiblePtenKernel(op.Type())) {
+    auto pt_kernel_signature = op.GetExpectedPtenKernelArgs(dygraph_exe_ctx);
+
+    VLOG(1) << framework::KernelSignatureToString(pt_kernel_signature);
+
+    auto pt_kernel_name = pten::KernelName(pt_kernel_signature.name);
+    auto pt_kernel_key = TransOpKernelTypeToPtenKernelKey(expected_kernel_key);
+    auto pt_kernel = pten::KernelFactory::Instance().SelectKernel(
+        pt_kernel_name, pt_kernel_key);
+
+    if (pt_kernel.IsValid()) {
+      VLOG(1) << "Dynamic mode PrepareImpl - kernel name: " << pt_kernel_name
+              << " | kernel key: " << pt_kernel_key
+              << " | kernel: " << pt_kernel;
+
+      // TODO(chenweihang): using CPUKernel when miss device kernel case
+      return PreparedOp(op, ctx, expected_kernel_key, pt_kernel_signature,
+                        pt_kernel, pt_kernel_context, dev_ctx);
+    } else {
+      VLOG(1) << "Dynamic mode ChoosePtenKernel - kernel `" << pt_kernel_name
+              << "` not found.";
+    }
+  }
+
   // 2. check if op[type] has kernel registered.
   auto& all_op_kernels = op.AllOpKernels();
   auto kernels_iter = all_op_kernels.find(op.Type());
@@ -153,7 +215,8 @@ PreparedOp PrepareImpl(const NameVarMap<VarType>& ins,
     kernel_iter = kernels.find(expected_kernel_key);
   }
 #endif
-  // TODO(jiabin): Add operator.cc's line 1000 part back when we need that case
+  // TODO(jiabin): Add operator.cc's line 1000 part back when we need that
+  // case
   PADDLE_ENFORCE_NE(kernel_iter, kernels.end(),
                     platform::errors::NotFound(
                         "Operator %s does not have kernel for %s.", op.Type(),
@@ -171,8 +234,10 @@ PreparedOp PreparedOp::Prepare(const NameVarMap<VarBase>& ins,
                                const framework::OperatorWithKernel& op,
                                const platform::Place& place,
                                const framework::AttributeMap& attrs,
-                               const framework::AttributeMap& default_attrs) {
-  return PrepareImpl<VarBase>(ins, outs, op, place, attrs, default_attrs);
+                               const framework::AttributeMap& default_attrs,
+                               pten::KernelContext* pt_kernel_context) {
+  return PrepareImpl<VarBase>(ins, outs, op, place, attrs, default_attrs,
+                              pt_kernel_context);
 }
 
 PreparedOp PreparedOp::Prepare(const NameVarMap<VariableWrapper>& ins,
@@ -180,9 +245,141 @@ PreparedOp PreparedOp::Prepare(const NameVarMap<VariableWrapper>& ins,
                                const framework::OperatorWithKernel& op,
                                const platform::Place& place,
                                const framework::AttributeMap& attrs,
-                               const framework::AttributeMap& default_attrs) {
+                               const framework::AttributeMap& default_attrs,
+                               pten::KernelContext* pt_kernel_context) {
   return PrepareImpl<VariableWrapper>(ins, outs, op, place, attrs,
-                                      default_attrs);
+                                      default_attrs, pt_kernel_context);
+}
+
+template <typename VarType>
+static void BuildDygraphPtenKernelContext(
+    const framework::KernelSignature& pt_kernel_signature,
+    const pten::Kernel& pt_kernel, const NameVarMap<VarType>& ins,
+    const NameVarMap<VarType>& outs, const framework::AttributeMap& attrs,
+    const framework::AttributeMap& default_attrs,
+    platform::DeviceContext* dev_ctx, pten::KernelContext* kernel_ctx) {
+  // TODO(chenweihang): now only work for very simple case,
+  // many cases need to be deal with later:
+  // 1. the input and output are not tensor
+  // 2. the dispensbale, duplicable input and output
+  // 3. needless attributes remove
+  // 4. use pt Tensor directly
+  // 5. kernel input is not DenseTensor
+  kernel_ctx->SetDeviceContext(dev_ctx);
+
+  auto& input_names = std::get<0>(pt_kernel_signature.args);
+  auto& attr_names = std::get<1>(pt_kernel_signature.args);
+  auto& output_names = std::get<2>(pt_kernel_signature.args);
+
+  auto& input_defs = pt_kernel.args_def().input_defs();
+  auto& output_defs = pt_kernel.args_def().output_defs();
+  auto& attr_defs = pt_kernel.args_def().attribute_defs();
+
+  PADDLE_ENFORCE_EQ(input_names.size(), input_defs.size(),
+                    platform::errors::InvalidArgument(
+                        "the size of inputs_args names (%d) must be equal to "
+                        "the size of kernel input_defs (%d).",
+                        input_names.size(), input_defs.size()));
+
+  PADDLE_ENFORCE_EQ(output_names.size(), output_defs.size(),
+                    platform::errors::InvalidArgument(
+                        "the size of outputs_args names (%d) must be equal to "
+                        "the size of kernel output_defs (%d).",
+                        output_names.size(), output_defs.size()));
+
+  PADDLE_ENFORCE_EQ(attr_names.size(), attr_defs.size(),
+                    platform::errors::InvalidArgument(
+                        "the size of attribute_args names (%d) must be equal "
+                        "to the size of kernel attribute_defs (%d).",
+                        attr_names.size(), attr_defs.size()));
+
+  for (size_t i = 0; i < input_names.size(); ++i) {
+    auto& in_def = input_defs.at(i);
+    auto& ins_vector = ins.at(input_names[i]);
+    if (kernel_ctx->InputsSize() <= i) {
+      paddle::SmallVector<std::shared_ptr<pten::TensorBase>> tmp_inputs;
+      for (const auto& var : ins_vector) {
+        const auto& variable = var->Var();
+        tmp_inputs.emplace_back(
+            experimental::MakePtenTensorBaseFromVar(variable, in_def));
+      }
+      kernel_ctx->EmplaceBackInputs(std::move(tmp_inputs));
+    } else {
+      size_t input_size = kernel_ctx->InputsSize();
+      for (size_t j = 0; j < ins_vector.size(); ++j) {
+        if (input_size > i + j) {
+          experimental::ReMakePtenDenseTensorFromVar(
+              ins_vector[j]->Var(), in_def,
+              kernel_ctx->MutableInputAt<pten::DenseTensor>(i + j));
+        }
+        // TODO(chenweihang): adapt multi-input case later
+      }
+      kernel_ctx->MutableInputRangeAt(i) =
+          std::make_pair(i, i + ins_vector.size());
+    }
+  }
+
+  for (size_t i = 0; i < output_names.size(); ++i) {
+    auto& out_def = output_defs.at(i);
+    auto& outs_vector = outs.at(output_names[i]);
+    if (kernel_ctx->OutputsSize() <= i) {
+      paddle::SmallVector<std::shared_ptr<pten::TensorBase>> tmp_outputs;
+      for (auto& var : outs_vector) {
+        auto* variable = var->MutableVar();
+        tmp_outputs.emplace_back(
+            experimental::MakePtenTensorBaseFromVar(variable, out_def));
+      }
+      kernel_ctx->EmplaceBackOutputs(std::move(tmp_outputs));
+    } else {
+      size_t output_size = kernel_ctx->OutputsSize();
+      for (size_t j = 0; j < outs_vector.size(); ++j) {
+        if (output_size > i + j) {
+          experimental::ReMakePtenDenseTensorFromVar(
+              outs_vector[j]->MutableVar(), out_def,
+              kernel_ctx->MutableOutputAt<pten::DenseTensor>(i + j));
+        }
+        // TODO(chenweihang): adapt multi-output case later
+      }
+      kernel_ctx->MutableOutputRangeAt(i) =
+          std::make_pair(i, i + outs_vector.size());
+    }
+  }
+
+  for (size_t i = 0; i < attr_names.size(); ++i) {
+    auto& attr = GetAttr(attrs, default_attrs, attr_names[i]);
+    if (attr_defs[i].type_index == std::type_index(typeid(pten::Scalar))) {
+      // TODO(chenweihang): support other attrs later
+      // TODO(zhangyunfei): Scalar should hold scaler type, and we should check
+      // attribtue type by attr_defs
+      if (std::type_index(attr.type()) == std::type_index(typeid(float))) {
+        kernel_ctx->EmplaceBackAttr(
+            std::move(pten::Scalar(BOOST_GET_CONST(float, attr))));
+      } else if (std::type_index(attr.type()) ==
+                 std::type_index(typeid(std::string))) {
+        kernel_ctx->EmplaceBackAttr(
+            std::move(pten::Scalar(BOOST_GET_CONST(std::string, attr))));
+      } else {
+        PADDLE_THROW(platform::errors::Unimplemented(
+            "unsupported cast op attribute `%s` to Scalar when construct "
+            "KernelContext in dygraph.",
+            attr_names[i]));
+      }
+    } else {
+      // TODO(chenweihang): support other attrs later
+      if (attr_defs[i].type_index == std::type_index(typeid(int))) {
+        kernel_ctx->EmplaceBackAttr(BOOST_GET_CONST(int, attr));
+      } else if (attr_defs[i].type_index == std::type_index(typeid(float))) {
+        kernel_ctx->EmplaceBackAttr(BOOST_GET_CONST(float, attr));
+      } else if (attr_defs[i].type_index == std::type_index(typeid(bool))) {
+        kernel_ctx->EmplaceBackAttr(BOOST_GET_CONST(bool, attr));
+      } else {
+        PADDLE_THROW(platform::errors::Unimplemented(
+            "unsupported cast op attribute `%s` when construct "
+            "KernelContext in dygraph.",
+            attr_names[i]));
+      }
+    }
+  }
 }
 
 template <typename VarType>
@@ -239,20 +436,58 @@ static void PreparedOpRunImpl(
   }
 }
 
+template <typename VarType>
+static void PreparedOpRunPtImpl(
+    const framework::OperatorBase& op,
+    const framework::KernelSignature& pt_kernel_signature,
+    const pten::Kernel& pt_kernel, pten::KernelContext* pt_kernel_context,
+    platform::DeviceContext* dev_ctx, const NameVarMap<VarType>& ins,
+    const NameVarMap<VarType>& outs, const framework::AttributeMap& attrs,
+    const framework::AttributeMap& default_attrs) {
+  DygraphInferShapeContext<VarType> infer_shape_ctx(&ins, &outs, &attrs,
+                                                    &default_attrs, op.Type());
+  static_cast<const framework::OperatorWithKernel&>(op).InferShape(
+      &infer_shape_ctx);
+
+  BuildDygraphPtenKernelContext<VarType>(pt_kernel_signature, pt_kernel, ins,
+                                         outs, attrs, default_attrs, dev_ctx,
+                                         pt_kernel_context);
+
+  pt_kernel(pt_kernel_context);
+
+  // Ensure that it does not affect the VarBase life cycle management
+  pt_kernel_context->ClearData();
+
+  // TODO(chenweihang): add debug flags later
+  // TODO(chenweihang): deal with complex cases later
+}
+
 void PreparedOp::Run(const NameVarMap<VarBase>& ins,
                      const NameVarMap<VarBase>& outs,
                      const framework::AttributeMap& attrs,
                      const framework::AttributeMap& default_attrs) {
-  PreparedOpRunImpl<VarBase>(op_, ctx_, kernel_type_, func_, dev_ctx_, ins,
-                             outs, attrs, default_attrs);
+  if (run_pten_kernel_) {
+    PreparedOpRunPtImpl<VarBase>(op_, pt_kernel_signature_, pt_kernel_,
+                                 pt_kernel_context_, dev_ctx_, ins, outs, attrs,
+                                 default_attrs);
+  } else {
+    PreparedOpRunImpl<VarBase>(op_, ctx_, kernel_type_, func_, dev_ctx_, ins,
+                               outs, attrs, default_attrs);
+  }
 }
 
 void PreparedOp::Run(const NameVarMap<VariableWrapper>& ins,
                      const NameVarMap<VariableWrapper>& outs,
                      const framework::AttributeMap& attrs,
                      const framework::AttributeMap& default_attrs) {
-  PreparedOpRunImpl<VariableWrapper>(op_, ctx_, kernel_type_, func_, dev_ctx_,
-                                     ins, outs, attrs, default_attrs);
+  if (run_pten_kernel_) {
+    PreparedOpRunPtImpl<VariableWrapper>(op_, pt_kernel_signature_, pt_kernel_,
+                                         pt_kernel_context_, dev_ctx_, ins,
+                                         outs, attrs, default_attrs);
+  } else {
+    PreparedOpRunImpl<VariableWrapper>(op_, ctx_, kernel_type_, func_, dev_ctx_,
+                                       ins, outs, attrs, default_attrs);
+  }
 }
 
 }  // namespace imperative
diff --git a/paddle/fluid/imperative/prepared_operator.h b/paddle/fluid/imperative/prepared_operator.h
index 53f876c498cd0..5262b265b1b53 100644
--- a/paddle/fluid/imperative/prepared_operator.h
+++ b/paddle/fluid/imperative/prepared_operator.h
@@ -21,10 +21,14 @@
 #include "paddle/fluid/framework/data_transform.h"
 #include "paddle/fluid/framework/op_kernel_type.h"
 #include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/framework/pten_utils.h"
+#include "paddle/fluid/framework/type_defs.h"
 #include "paddle/fluid/imperative/execution_context.h"
 #include "paddle/fluid/imperative/layer.h"
 #include "paddle/fluid/imperative/type_defs.h"
 
+#include "paddle/pten/include/core.h"
+
 DECLARE_bool(use_mkldnn);
 
 namespace paddle {
@@ -147,19 +151,29 @@ class PreparedOp {
              const framework::OperatorWithKernel::OpKernelFunc& func,
              platform::DeviceContext* dev_ctx);
 
+  PreparedOp(const framework::OperatorBase& op,
+             const framework::RuntimeContext& ctx,
+             const framework::OpKernelType& kernel_type,
+             const framework::KernelSignature& kernel_signature,
+             const pten::Kernel& pt_kernel,
+             pten::KernelContext* pt_kernel_context,
+             platform::DeviceContext* dev_ctx);
+
   static PreparedOp Prepare(const NameVarMap<VarBase>& ins,
                             const NameVarMap<VarBase>& outs,
                             const framework::OperatorWithKernel& op,
                             const platform::Place& place,
                             const framework::AttributeMap& attrs,
-                            const framework::AttributeMap& default_attrs);
+                            const framework::AttributeMap& default_attrs,
+                            pten::KernelContext* pt_kernel_context = nullptr);
 
   static PreparedOp Prepare(const NameVarMap<VariableWrapper>& ins,
                             const NameVarMap<VariableWrapper>& outs,
                             const framework::OperatorWithKernel& op,
                             const platform::Place& place,
                             const framework::AttributeMap& attrs,
-                            const framework::AttributeMap& default_attrs);
+                            const framework::AttributeMap& default_attrs,
+                            pten::KernelContext* pt_kernel_context = nullptr);
 
   void Run(const NameVarMap<VarBase>& in, const NameVarMap<VarBase>& out,
            const framework::AttributeMap& attrs,
@@ -178,6 +192,15 @@ class PreparedOp {
   framework::OpKernelType kernel_type_;
   framework::OperatorWithKernel::OpKernelFunc func_;
   platform::DeviceContext* dev_ctx_;
+  // NOTE(chenweihang): Similar op members are used to adapt to
+  // new pten kernel, if there is a better design in the future,
+  // we may polish the implementation here
+  bool run_pten_kernel_{false};
+  framework::KernelSignature pt_kernel_signature_;
+  pten::Kernel pt_kernel_;
+  // In order to reduce the compatibility phase
+  // performance overhead, temporarily cache KernelContext
+  pten::KernelContext* pt_kernel_context_;
 };
 
 }  // namespace imperative
diff --git a/paddle/fluid/imperative/tracer.cc b/paddle/fluid/imperative/tracer.cc
index 0f363d0ea1bff..1d06a63e38f8d 100644
--- a/paddle/fluid/imperative/tracer.cc
+++ b/paddle/fluid/imperative/tracer.cc
@@ -213,6 +213,8 @@ void Tracer::TraceOp(const std::string& type, const NameVarBaseMap& ins,
     OpBase::Run(*op, new_ins, outs, attrs, default_attrs, place);
   } catch (platform::EnforceNotMet& exception) {
     framework::AppendErrorOpHint(type, &exception);
+    // Compatible impl: clear pten kernel context data when throw error
+    OpBase::GetKernelContext()->ClearData();
     throw std::move(exception);
   } catch (std::exception& ex) {
     PADDLE_THROW(platform::errors::Fatal(
diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt
index 13dc22c4dff84..09c72cb13b803 100644
--- a/paddle/fluid/inference/CMakeLists.txt
+++ b/paddle/fluid/inference/CMakeLists.txt
@@ -35,6 +35,7 @@ endif()
 
 # fluid_modules exclude API-interface of inference/api and inference/capi_exp
 get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES)
+get_property(pten_modules GLOBAL PROPERTY PTEN_MODULES)
 
 # Adapt to custom op mechanism: Include the header files related to the data type
 # to avoid exposing the path of the underlying file
@@ -50,9 +51,9 @@ set(STATIC_INFERENCE_API paddle_inference_api analysis_predictor
         analysis_config paddle_pass_builder activation_functions ${mkldnn_quantizer_cfg})
 #TODO(wilber, T8T9): Do we still need to support windows gpu static library?
 if(WIN32 AND WITH_GPU)
-  cc_library(paddle_inference DEPS ${fluid_modules} ${STATIC_INFERENCE_API})
+  cc_library(paddle_inference DEPS ${fluid_modules} ${pten_modules} ${STATIC_INFERENCE_API})
 else()
-  create_static_lib(paddle_inference ${fluid_modules} ${STATIC_INFERENCE_API})
+  create_static_lib(paddle_inference ${fluid_modules} ${pten_modules} ${STATIC_INFERENCE_API})
 endif()
 
 if(NOT APPLE)
@@ -82,7 +83,7 @@ set(SHARED_INFERENCE_SRCS
     ${PADDLE_CUSTOM_OP_SRCS})
 
 # shared inference library deps
-set(SHARED_INFERENCE_DEPS ${fluid_modules} analysis_predictor)
+set(SHARED_INFERENCE_DEPS ${fluid_modules} ${pten_modules} analysis_predictor)
 
 if (WITH_CRYPTO) 
     set(SHARED_INFERENCE_DEPS ${SHARED_INFERENCE_DEPS} paddle_crypto)
diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.cc b/paddle/fluid/inference/analysis/ir_pass_manager.cc
index d996474f3d677..dcbbee97a772c 100644
--- a/paddle/fluid/inference/analysis/ir_pass_manager.cc
+++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc
@@ -56,10 +56,18 @@ void IRPassManager::CreatePasses(Argument *argument,
     auto pass = framework::ir::PassRegistry::Instance().Get(pass_name);
 
     if (pass_name == "graph_viz_pass") {
-      std::string dot_file_path = std::to_string(pass_num) + "_ir_" +
-                                  (pre_pass.empty() ? "origin" : pre_pass) +
-                                  ".dot";
+      std::string optim_cache_dir = argument->optim_cache_dir();
+      std::string dot_file_path;
+      if (optim_cache_dir.empty()) {
+        dot_file_path = std::to_string(pass_num) + "_ir_" +
+                        (pre_pass.empty() ? "origin" : pre_pass) + ".dot";
+      } else {
+        dot_file_path = optim_cache_dir + "/" + std::to_string(pass_num) +
+                        "_ir_" + (pre_pass.empty() ? "origin" : pre_pass) +
+                        ".dot";
+      }
       pass->Set("graph_viz_path", new std::string(std::move(dot_file_path)));
+      pass->Set("optim_cache_dir", new std::string(std::move(optim_cache_dir)));
       pass_num++;
     } else if (pass_name == "mkldnn_placement_pass") {
       pass->Set("mkldnn_enabled_op_types",
diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc
index 5d056e054f51c..0440801cfc538 100644
--- a/paddle/fluid/inference/api/analysis_config.cc
+++ b/paddle/fluid/inference/api/analysis_config.cc
@@ -12,7 +12,9 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#include <sstream>
 #include <string>
+#include <tuple>
 #include "paddle/fluid/inference/api/paddle_analysis_config.h"
 #include "paddle/fluid/inference/api/paddle_pass_builder.h"
 #include "paddle/fluid/inference/utils/table_printer.h"
@@ -20,6 +22,10 @@
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/gpu_info.h"
 
+#ifdef PADDLE_WITH_TENSORRT
+#include "paddle/fluid/inference/tensorrt/helper.h"
+#endif
+
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 DECLARE_uint64(initial_gpu_memory_in_mb);
 #endif
@@ -758,17 +764,6 @@ std::string AnalysisConfig::Summary() {
       {"mkldnn_cache_capacity", std::to_string(mkldnn_cache_capacity_)});
   os.InsetDivider();
 
-  auto Precision2String =
-      [](paddle::AnalysisConfig::Precision prec) -> std::string {
-    if (prec == Precision::kFloat32)
-      return "fp32";
-    else if (prec == Precision::kHalf)
-      return "fp16";
-    else if (prec == Precision::kInt8)
-      return "int8";
-    else
-      return "None";
-  };
   // gpu info
   os.InsertRow({"use_gpu", use_gpu_ ? "true" : "false"});
   if (use_gpu_) {
@@ -780,6 +775,33 @@ std::string AnalysisConfig::Summary() {
 
     os.InsertRow({"use_tensorrt", use_tensorrt_ ? "true" : "false"});
     if (use_tensorrt_) {
+#ifdef PADDLE_WITH_TENSORRT
+      auto Precision2String =
+          [](paddle::AnalysisConfig::Precision prec) -> std::string {
+        if (prec == Precision::kFloat32)
+          return "fp32";
+        else if (prec == Precision::kHalf)
+          return "fp16";
+        else if (prec == Precision::kInt8)
+          return "int8";
+        else
+          return "None";
+      };
+      auto version2string =
+          [](const std::tuple<int, int, int> &ver) -> std::string {
+        std::ostringstream os;
+        int major = std::get<0>(ver);
+        int minor = std::get<1>(ver);
+        int patch = std::get<2>(ver);
+        os << major << "." << minor << "." << patch;
+        return os.str();
+      };
+      os.InsertRow(
+          {"trt_compile_version",
+           version2string(inference::tensorrt::GetTrtCompileVersion())});
+      os.InsertRow(
+          {"trt_runtime_version",
+           version2string(inference::tensorrt::GetTrtRuntimeVersion())});
       os.InsertRow({"tensorrt_precision_mode",
                     Precision2String(tensorrt_precision_mode_)});
       os.InsertRow({"tensorrt_workspace_size",
@@ -805,6 +827,7 @@ std::string AnalysisConfig::Summary() {
       if (trt_use_dla_) {
         os.InsertRow({"tensorrt_dla_core", std::to_string(trt_dla_core_)});
       }
+#endif
     }
   }
   os.InsetDivider();
diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc
index dda4be8f81c63..ad0647236acb9 100644
--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -1415,6 +1415,7 @@ USE_TRT_CONVERTER(tile);
 USE_TRT_CONVERTER(conv3d);
 USE_TRT_CONVERTER(conv3d_transpose);
 USE_TRT_CONVERTER(mish);
+USE_TRT_CONVERTER(deformable_conv);
 USE_TRT_CONVERTER(pool3d)
 #endif
 
diff --git a/paddle/fluid/inference/api/paddle_pass_builder.cc b/paddle/fluid/inference/api/paddle_pass_builder.cc
index d107ea9e6fdfe..334a70d3e0647 100644
--- a/paddle/fluid/inference/api/paddle_pass_builder.cc
+++ b/paddle/fluid/inference/api/paddle_pass_builder.cc
@@ -198,15 +198,15 @@ CpuPassStrategy::CpuPassStrategy() : PassStrategy({}) {
                   // "embedding_fc_lstm_fuse_pass", //
                   // TODO(wilber): fix correctness problem.
                   // "fc_lstm_fuse_pass",                    //
-                  "mul_lstm_fuse_pass",         //
-                  "fc_gru_fuse_pass",           //
-                  "mul_gru_fuse_pass",          //
-                  "seq_concat_fc_fuse_pass",    //
-                  "squeeze2_matmul_fuse_pass",  //
-                  "reshape2_matmul_fuse_pass",  //
-                  "flatten2_matmul_fuse_pass",  //
-                  "map_matmul_v2_to_mul_pass",  //
-                  // "map_matmul_v2_to_matmul_pass",            //
+                  "mul_lstm_fuse_pass",                      //
+                  "fc_gru_fuse_pass",                        //
+                  "mul_gru_fuse_pass",                       //
+                  "seq_concat_fc_fuse_pass",                 //
+                  "squeeze2_matmul_fuse_pass",               //
+                  "reshape2_matmul_fuse_pass",               //
+                  "flatten2_matmul_fuse_pass",               //
+                  "map_matmul_v2_to_mul_pass",               //
+                  "map_matmul_v2_to_matmul_pass",            //
                   "map_matmul_to_mul_pass",                  //
                   "fc_fuse_pass",                            //
                   "repeated_fc_relu_fuse_pass",              //
@@ -249,6 +249,7 @@ void CpuPassStrategy::EnableMKLDNN() {
              "conv_relu6_mkldnn_fuse_pass",                //
              "conv_swish_mkldnn_fuse_pass",                //
              "conv_hard_swish_mkldnn_fuse_pass",           //
+             "conv_hard_sigmoid_mkldnn_fuse_pass",         //
              "scale_matmul_fuse_pass",                     //
              "reshape_transpose_matmul_mkldnn_fuse_pass",  //
              "matmul_transpose_reshape_fuse_pass",         //
diff --git a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
index b6aa0a230cc2d..a885b69fa7fbc 100644
--- a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
+++ b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
@@ -20,6 +20,7 @@ nv_library(tensorrt_converter
                 mish_op.cc
                 nearest_interp_v2_op.cc
                 pool3d_op.cc
+                deformable_conv_op.cc
            DEPS tensorrt_engine tensorrt_plugin operator scope framework_proto op_registry)
 
 nv_test(test_op_converter SRCS test_op_converter.cc DEPS
diff --git a/paddle/fluid/inference/tensorrt/convert/deformable_conv_op.cc b/paddle/fluid/inference/tensorrt/convert/deformable_conv_op.cc
new file mode 100644
index 0000000000000..02d460ffa1cbb
--- /dev/null
+++ b/paddle/fluid/inference/tensorrt/convert/deformable_conv_op.cc
@@ -0,0 +1,111 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <cstdio>
+#include <vector>
+#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
+#include "paddle/fluid/inference/tensorrt/plugin/deformable_conv_op_plugin.h"
+
+namespace paddle {
+namespace framework {
+class Scope;
+namespace proto {
+class OpDesc;
+}  // namespace proto
+}  // namespace framework
+}  // namespace paddle
+
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+
+class DeformableConvOpConverter : public OpConverter {
+  void operator()(const framework::proto::OpDesc& op,
+                  const framework::Scope& scope, bool test_mode) override {
+    VLOG(3) << "convert a deformable conv op to tensorrt plugin";
+
+    framework::OpDesc op_desc(op, nullptr);
+    std::string input_name = op_desc.Input("Input").front();
+    std::string offset_name = op_desc.Input("Offset").front();
+    std::string mask_name = op_desc.Input("Mask").front();
+    std::string filter_name = op_desc.Input("Filter").front();
+
+    auto* input_tensor = engine_->GetITensor(input_name);
+    auto* offset_tensor = engine_->GetITensor(offset_name);
+    auto* mask_tensor = engine_->GetITensor(mask_name);
+    auto* filter_var = scope.FindVar(filter_name);
+    auto* filter_tensor = filter_var->GetMutable<framework::LoDTensor>();
+
+    float* filter_data =
+        engine_->GetWeightCPUData(filter_name, filter_tensor, false);
+
+    const int c_o = filter_tensor->dims()[0];
+    const int c_i = filter_tensor->dims()[1];
+    const int k_h = filter_tensor->dims()[2];
+    const int k_w = filter_tensor->dims()[3];
+    std::vector<int> kernel_dims = {c_o, c_i, k_h, k_w};
+
+    auto strides =
+        BOOST_GET_CONST(std::vector<int>, op_desc.GetAttr("strides"));
+    auto paddings =
+        BOOST_GET_CONST(std::vector<int>, op_desc.GetAttr("paddings"));
+    auto dilations =
+        BOOST_GET_CONST(std::vector<int>, op_desc.GetAttr("dilations"));
+
+    auto groups = BOOST_GET_CONST(int, op_desc.GetAttr("groups"));
+    auto deformable_groups =
+        BOOST_GET_CONST(int, op_desc.GetAttr("deformable_groups"));
+    auto im2col_step = BOOST_GET_CONST(int, op_desc.GetAttr("im2col_step"));
+
+    nvinfer1::Weights weights;
+    weights.count = filter_tensor->numel();
+    if (engine_->WithFp16()) {
+      auto half_filter_data = new half[filter_tensor->numel()];
+      for (int i = 0; i < filter_tensor->numel(); i++) {
+        half_filter_data[i] = static_cast<half>(filter_data[i]);
+      }
+      weights.type = nvinfer1::DataType::kHALF;
+      weights.values = half_filter_data;
+    } else {
+      weights.type = nvinfer1::DataType::kFLOAT;
+      weights.values = filter_data;
+    }
+    auto* deformable_conv_plugin = new plugin::DeformableConvPlugin(
+        engine_->WithFp16() ? nvinfer1::DataType::kHALF
+                            : nvinfer1::DataType::kFLOAT,
+        weights, kernel_dims, strides, paddings, dilations, groups,
+        deformable_groups, im2col_step);
+
+    std::vector<nvinfer1::ITensor*> deformable_conv_inputs;
+    deformable_conv_inputs.push_back(input_tensor);
+    deformable_conv_inputs.push_back(offset_tensor);
+    deformable_conv_inputs.push_back(mask_tensor);
+
+    auto* deformable_conv_layer = engine_->network()->addPluginV2(
+        deformable_conv_inputs.data(), deformable_conv_inputs.size(),
+        *deformable_conv_plugin);
+
+    std::vector<std::string> output_names;
+    output_names.push_back(op_desc.Output("Output").front());
+
+    RreplenishLayerAndOutput(deformable_conv_layer, "deformable_conv",
+                             output_names, test_mode);
+  }
+};
+
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
+
+REGISTER_TRT_OP_CONVERTER(deformable_conv, DeformableConvOpConverter);
diff --git a/paddle/fluid/inference/tensorrt/convert/matmul_op.cc b/paddle/fluid/inference/tensorrt/convert/matmul_op.cc
index 0358c86926bec..7b017900a02c9 100644
--- a/paddle/fluid/inference/tensorrt/convert/matmul_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/matmul_op.cc
@@ -61,6 +61,38 @@ class MatMulOpConverter : public OpConverter {
     if (fabs(alpha - 1.0) < std::numeric_limits<float>::epsilon()) {
       engine_->SetITensor(output_name, layer->getOutput(0));
     } else {
+      // IScaleLayer requires the input must have at least
+      // three dimensions in static shape mode and at least
+      // four dimensions in dynamic shape mode.
+      auto* matmul_out = layer->getOutput(0);
+      nvinfer1::Dims out_shape = matmul_out->getDimensions();
+      const int out_dims = out_shape.nbDims;
+      bool need_change_dim = false;
+
+      if (engine_->with_dynamic_shape()) {
+        if (out_dims == 3) {
+          need_change_dim = true;
+        }
+      } else {
+        if (out_dims == 2) {
+          need_change_dim = true;
+        }
+      }
+
+      if (need_change_dim) {
+        nvinfer1::Dims reshape_dim;
+        reshape_dim.nbDims = out_dims + 1;
+        reshape_dim.d[out_dims] = 1;
+        for (int i = 0; i < out_dims; i++) {
+          reshape_dim.d[i] = out_shape.d[i];
+        }
+
+        auto* reshape_layer =
+            TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *matmul_out);
+        reshape_layer->setReshapeDimensions(reshape_dim);
+        matmul_out = reshape_layer->getOutput(0);
+      }
+
       auto create_weights = [&](float data, const std::string& type) -> float* {
         std::unique_ptr<framework::Tensor> tmp_tensor(new framework::Tensor());
         tmp_tensor->Resize({1});
@@ -80,9 +112,18 @@ class MatMulOpConverter : public OpConverter {
       TensorRTEngine::Weight nv_power{nvinfer1::DataType::kFLOAT,
                                       static_cast<void*>(power_data), 1};
       auto* scale_layer = TRT_ENGINE_ADD_LAYER(
-          engine_, Scale, *layer->getOutput(0), nvinfer1::ScaleMode::kUNIFORM,
+          engine_, Scale, *matmul_out, nvinfer1::ScaleMode::kUNIFORM,
           nv_shift.get(), nv_alpha.get(), nv_power.get());
-      engine_->SetITensor(output_name, scale_layer->getOutput(0));
+      auto* scale_out = scale_layer->getOutput(0);
+
+      if (need_change_dim) {
+        auto* reshape_layer =
+            TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *scale_out);
+        reshape_layer->setReshapeDimensions(out_shape);
+        scale_out = reshape_layer->getOutput(0);
+      }
+
+      engine_->SetITensor(output_name, scale_out);
     }
     if (test_mode) {  // the test framework can not determine which is the
                       // output, so place the declaration inside.
diff --git a/paddle/fluid/inference/tensorrt/convert/pool3d_op.cc b/paddle/fluid/inference/tensorrt/convert/pool3d_op.cc
index 9baed499f14a7..b8e87a8d94d1f 100644
--- a/paddle/fluid/inference/tensorrt/convert/pool3d_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/pool3d_op.cc
@@ -30,8 +30,8 @@ namespace tensorrt {
 
 inline void DealCeilMode(const nvinfer1::Dims &input_shape,
                          std::vector<int> ksize, std::vector<int> strides,
-                         std::vector<int> paddings, nvinfer1::DimsCHW *pre_pad,
-                         nvinfer1::DimsCHW *post_pad, int input_dims) {
+                         std::vector<int> paddings, nvinfer1::Dims3 *pre_pad,
+                         nvinfer1::Dims3 *post_pad, int input_dims) {
   int input_depth = input_shape.d[input_dims - 3];
   int input_height = input_shape.d[input_dims - 2];
   int input_width = input_shape.d[input_dims - 1];
@@ -56,15 +56,15 @@ inline void DealCeilMode(const nvinfer1::Dims &input_shape,
       1;
 
   if (floor_d_output_size != ceil_d_output_size) {
-    post_pad->c() = strides[0] - 1;
+    post_pad->d[0] = strides[0] - 1;
   }
 
   if (floor_h_output_size != ceil_h_output_size) {
-    post_pad->h() = strides[1] - 1;
+    post_pad->d[1] = strides[1] - 1;
   }
 
   if (floor_w_output_size != ceil_w_output_size) {
-    post_pad->w() = strides[2] - 1;
+    post_pad->d[2] = strides[2] - 1;
   }
 }
 
@@ -118,9 +118,9 @@ class Pool3dOpConverter : public OpConverter {
       reduce_operation = nvinfer1::ReduceOperation::kAVG;
       plugin_pool_type = plugin::Pool3DPlugin::Pool3DType::avg;
     }
-    nvinfer1::DimsCHW nv_ksize(ksize[0], ksize[1], ksize[2]);
-    nvinfer1::DimsCHW nv_strides(strides[0], strides[1], strides[2]);
-    nvinfer1::DimsCHW nv_paddings(paddings[0], paddings[1], paddings[2]);
+    nvinfer1::Dims3 nv_ksize(ksize[0], ksize[1], ksize[2]);
+    nvinfer1::Dims3 nv_strides(strides[0], strides[1], strides[2]);
+    nvinfer1::Dims3 nv_paddings(paddings[0], paddings[1], paddings[2]);
     nvinfer1::ILayer *layer = nullptr;
     if (op_desc.HasAttr("enable_int8")) {
       CHECK(op_desc.HasAttr("X_scale"));
diff --git a/paddle/fluid/inference/tensorrt/engine.cc b/paddle/fluid/inference/tensorrt/engine.cc
index 26182a7932199..64116b7973e71 100644
--- a/paddle/fluid/inference/tensorrt/engine.cc
+++ b/paddle/fluid/inference/tensorrt/engine.cc
@@ -148,12 +148,21 @@ void TensorRTEngine::FreezeNetwork() {
       // and outputs have scales,
       // this layer's precision and output type are set to float32.
       // This step has no effect if this layer is fused during TRT optimization.
+      int layers_no_int8 = 0;
       for (int i = 0; i < network()->getNbLayers(); i++) {
         auto layer = network()->getLayer(i);
         if (!is_layer_int8(layer)) {
           layer->setPrecision(nvinfer1::DataType::kFLOAT);
+          ++layers_no_int8;
         }
       }
+      // Disable int8 or build engine failed if all layers aren't int8
+      if (layers_no_int8 == network()->getNbLayers()) {
+        nvinfer1::BuilderFlags flags = infer_builder_config_->getFlags();
+        flags = flags & ~(1U << static_cast<int>(nvinfer1::BuilderFlag::kINT8));
+        // reset flags
+        infer_builder_config_->setFlags(flags);
+      }
 #else
       LOG(WARNING) << "If your TensorRT version is lower than 5.1.2.2, you "
                       "must provide quantization scales for all tensors using "
@@ -233,11 +242,11 @@ void TensorRTEngine::FreezeNetwork() {
       *network(), *infer_builder_config_));
 #else
   infer_builder_config_->setFlag(nvinfer1::BuilderFlag::kSPARSE_WEIGHTS);
-  infer_ptr<nvinfer1::IHostMemory> plan(infer_builder_->buildSerializedNetwork(
+  ihost_memory_.reset(infer_builder_->buildSerializedNetwork(
       *network(), *infer_builder_config_));
   infer_ptr<nvinfer1::IRuntime> runtime(createInferRuntime(&logger_));
-  infer_engine_.reset(
-      runtime->deserializeCudaEngine(plan->data(), plan->size()));
+  infer_engine_.reset(runtime->deserializeCudaEngine(ihost_memory_->data(),
+                                                     ihost_memory_->size()));
 #endif
 
   PADDLE_ENFORCE_NOT_NULL(
diff --git a/paddle/fluid/inference/tensorrt/engine.h b/paddle/fluid/inference/tensorrt/engine.h
index 0e1b9fe3366ca..9397d4e89de42 100644
--- a/paddle/fluid/inference/tensorrt/engine.h
+++ b/paddle/fluid/inference/tensorrt/engine.h
@@ -273,7 +273,14 @@ class TensorRTEngine {
         infer_engine_,
         platform::errors::InvalidArgument(
             "The TensorRT engine must be built first before serialization"));
+#if IS_TRT_VERSION_LT(8000)
     ihost_memory_.reset(infer_engine_->serialize());
+#else
+    PADDLE_ENFORCE_NOT_NULL(
+        ihost_memory_,
+        platform::errors::InvalidArgument(
+            "TensorRT >= 8.0 requires that buildSerializedNetwork is called"));
+#endif
     return ihost_memory_.get();
   }
 
diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc
index 13504f444109b..603c7282074ac 100644
--- a/paddle/fluid/inference/tensorrt/op_teller.cc
+++ b/paddle/fluid/inference/tensorrt/op_teller.cc
@@ -143,7 +143,8 @@ struct SimpleOpTypeSetTeller : public Teller {
                                              "conv3d_transpose",
                                              "mish",
                                              "nearest_interp_v2",
-                                             "pool3d"};
+                                             "pool3d",
+                                             "deformable_conv"};
 };
 
 bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8,
@@ -332,6 +333,51 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8,
 #endif
     }
 
+    if (op_type == "deformable_conv") {
+      if (with_dynamic_shape) {
+        VLOG(3) << "Deformable conv trt plugin does not support dynamic shape";
+        return false;
+      }
+      auto* block = desc.Block();
+      auto input_name = desc.Input("Input")[0];
+      auto* input_desc = block->FindVar(input_name);
+      const auto input_shape = input_desc->GetShape();
+
+      if (input_shape.size() != 4) {
+        VLOG(3) << "Input of deformable conv should be 4-D Tensor, but got "
+                << input_shape.size();
+        return false;
+      }
+
+      auto filter_name = desc.Input("Filter")[0];
+      auto* filter_desc = block->FindVar(filter_name);
+      const auto filter_shape = filter_desc->GetShape();
+
+      int groups = BOOST_GET_CONST(int, desc.GetAttr("groups"));
+      if (input_shape[1] != filter_shape[1] * groups) {
+        VLOG(3) << "The number of input channels should be equal to filter "
+                << "channels * groups. But got input channels "
+                << input_shape[1] << "filter channels " << filter_shape[1];
+        return false;
+      }
+
+      const std::vector<int> strides =
+          BOOST_GET_CONST(std::vector<int>, desc.GetAttr("strides"));
+      if (strides.size() != 2) {
+        VLOG(3) << "The size of strides should be 2, but got "
+                << strides.size();
+        return false;
+      }
+
+      const std::vector<int> paddings =
+          BOOST_GET_CONST(std::vector<int>, desc.GetAttr("paddings"));
+      if (paddings.size() != 2) {
+        VLOG(3) << "The size of paddings shoule be 2, but got "
+                << paddings.size();
+        return false;
+      }
+    }
+
     if (op_type == "matmul") {
       auto* block = desc.Block();
       if (block == nullptr) {
@@ -1504,7 +1550,7 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8,
             !BOOST_GET_CONST(bool, desc.GetAttr("keep_dim")))
           return false;
       }
-      if (desc.HasAttr("reduce_all")) {
+      if (desc.HasAttr("out_dtype")) {
         int out_dtype = BOOST_GET_CONST(int32_t, desc.GetAttr("out_dtype"));
         if (out_dtype != -1) {
           return false;
diff --git a/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt b/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt
index 9e93894e623c0..3eece7e500e68 100644
--- a/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt
+++ b/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt
@@ -11,6 +11,7 @@ nv_library(tensorrt_plugin
            gather_nd_op_plugin.cu
            mish_op_plugin.cu
            pool3d_op_plugin.cu
+           deformable_conv_op_plugin.cu
            DEPS enforce tensorrt_engine prelu tensor bert_encoder_functor)
 
 nv_test(test_split_plugin SRCS test_split_plugin.cc DEPS
diff --git a/paddle/fluid/inference/tensorrt/plugin/deformable_conv_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/deformable_conv_op_plugin.cu
new file mode 100644
index 0000000000000..760f379eb07cb
--- /dev/null
+++ b/paddle/fluid/inference/tensorrt/plugin/deformable_conv_op_plugin.cu
@@ -0,0 +1,616 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <cuda_fp16.h>
+#include <cuda_runtime.h>
+#include <algorithm>
+#include <cstdio>
+
+#include "paddle/fluid/inference/tensorrt/plugin/deformable_conv_op_plugin.h"
+
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+namespace plugin {
+
+static constexpr int kNumCUDAThreads = 512;
+static constexpr int kNumMaximumNumBlocks = 4096;
+
+static inline int NumBlocks(const int N) {
+  return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads,
+                  kNumMaximumNumBlocks);
+}
+
+static inline int ConvOutputSize(int input_size, int filter_size, int dilation,
+                                 int padding, int stride) {
+  const int dkernel = dilation * (filter_size - 1) + 1;
+  int output_size = (input_size + 2 * padding - dkernel) / stride + 1;
+  return output_size;
+}
+
+nvinfer1::Weights DeformableConvPlugin::copyToDevice(const void* hostData,
+                                                     size_t count) {
+  int num_bytes = (data_type_ == nvinfer1::DataType::kFLOAT ? 4 : 2);
+  void* deviceData;
+  PADDLE_ENFORCE_CUDA_SUCCESS(cudaMalloc(&deviceData, count * num_bytes));
+  PADDLE_ENFORCE_CUDA_SUCCESS(cudaMemcpy(
+      deviceData, hostData, count * num_bytes, cudaMemcpyHostToDevice));
+  return nvinfer1::Weights{data_type_, deviceData, int64_t(count)};
+}
+
+void DeformableConvPlugin::serializeFromDevice(
+    void** hostBuffer, const nvinfer1::Weights& deviceWeights) const {
+  int num_bytes = (data_type_ == nvinfer1::DataType::kFLOAT ? 4 : 2);
+  PADDLE_ENFORCE_CUDA_SUCCESS(
+      cudaMemcpy(static_cast<char*>(*hostBuffer), deviceWeights.values,
+                 deviceWeights.count * num_bytes, cudaMemcpyDeviceToHost));
+  hostBuffer += deviceWeights.count * num_bytes;
+}
+
+nvinfer1::Weights DeformableConvPlugin::deserializeToDevice(
+    const void** hostBuffer, size_t count) {
+  int num_bytes = (data_type_ == nvinfer1::DataType::kFLOAT ? 4 : 2);
+  nvinfer1::Weights w =
+      copyToDevice(static_cast<const char*>(*hostBuffer), count);
+  hostBuffer += count * num_bytes;
+  return w;
+}
+
+DeformableConvPlugin::DeformableConvPlugin(
+    const nvinfer1::DataType data_type, const nvinfer1::Weights& weights,
+    const std::vector<int>& kernel_dims, const std::vector<int>& strides,
+    const std::vector<int>& paddings, const std::vector<int>& dilations,
+    const int groups, const int deformable_groups, const int im2col_step)
+    : data_type_(data_type),
+      groups_(groups),
+      deformable_groups_(deformable_groups),
+      im2col_step_(im2col_step) {
+  weights_ = copyToDevice(weights.values, weights.count);
+  kernel_dims_.insert(kernel_dims_.end(), kernel_dims.cbegin(),
+                      kernel_dims.cend());
+
+  strides_.insert(strides_.end(), strides.cbegin(), strides.cend());
+  paddings_.insert(paddings_.end(), paddings.cbegin(), paddings.cend());
+  dilations_.insert(dilations_.end(), dilations.cbegin(), dilations.cend());
+  PADDLE_ENFORCE_EQ(data_type_ == nvinfer1::DataType::kFLOAT ||
+                        data_type_ == nvinfer1::DataType::kHALF,
+                    true, platform::errors::InvalidArgument(
+                              "The DeformableConv TRT Plugin's input type "
+                              "should be float or half."));
+  PADDLE_ENFORCE_EQ(
+      paddings_.size(), strides_.size(),
+      platform::errors::InvalidArgument(
+          "The size of paddings (%d) is not equal to the size of strides (%d).",
+          paddings_.size(), strides_.size()));
+}
+
+DeformableConvPlugin::DeformableConvPlugin(
+    const nvinfer1::DataType data_type, const nvinfer1::Weights& weights,
+    const std::vector<int>& kernel_dims, const std::vector<int>& strides,
+    const std::vector<int>& paddings, const std::vector<int>& dilations,
+    const int groups, const int deformable_groups, const int im2col_step,
+    const std::vector<int>& input_dim, const std::vector<int>& offset_dim,
+    const std::vector<int>& mask_dim, const std::vector<int>& output_dim)
+    : data_type_(data_type),
+      groups_(groups),
+      deformable_groups_(deformable_groups),
+      im2col_step_(im2col_step) {
+  weights_ = copyToDevice(weights.values, weights.count);
+  kernel_dims_.insert(kernel_dims_.end(), kernel_dims.cbegin(),
+                      kernel_dims.cend());
+
+  strides_.insert(strides_.end(), strides.cbegin(), strides.cend());
+  paddings_.insert(paddings_.end(), paddings.cbegin(), paddings.cend());
+  dilations_.insert(dilations_.end(), dilations.cbegin(), dilations.cend());
+  input_dim_.insert(input_dim_.end(), input_dim.cbegin(), input_dim.cend());
+  offset_dim_.insert(offset_dim_.end(), offset_dim.cbegin(), offset_dim.cend());
+  mask_dim_.insert(mask_dim_.end(), mask_dim.cbegin(), mask_dim.cend());
+  output_dim_.insert(output_dim_.end(), output_dim.cbegin(), output_dim.cend());
+  PADDLE_ENFORCE_EQ(data_type_ == nvinfer1::DataType::kFLOAT ||
+                        data_type_ == nvinfer1::DataType::kHALF,
+                    true, platform::errors::InvalidArgument(
+                              "The DeformableConv TRT Plugin's input type "
+                              "should be float or half."));
+  PADDLE_ENFORCE_EQ(
+      paddings_.size(), strides_.size(),
+      platform::errors::InvalidArgument(
+          "The size of paddings (%d) is not equal to the size of strides (%d).",
+          paddings_.size(), strides_.size()));
+}
+
+DeformableConvPlugin::DeformableConvPlugin(const void* data, size_t length) {
+  DeserializeValue(&data, &length, &data_type_);
+  DeserializeValue(&data, &length, &strides_);
+  DeserializeValue(&data, &length, &paddings_);
+  DeserializeValue(&data, &length, &dilations_);
+  DeserializeValue(&data, &length, &groups_);
+  DeserializeValue(&data, &length, &deformable_groups_);
+  DeserializeValue(&data, &length, &im2col_step_);
+  DeserializeValue(&data, &length, &kernel_dims_);
+  int64_t count;
+  DeserializeValue(&data, &length, &count);
+  weights_ = deserializeToDevice(&data, count);
+  DeserializeValue(&data, &length, &input_dim_);
+  DeserializeValue(&data, &length, &offset_dim_);
+  DeserializeValue(&data, &length, &mask_dim_);
+  DeserializeValue(&data, &length, &output_dim_);
+}
+
+DeformableConvPlugin::~DeformableConvPlugin() {
+  if (weights_.values) {
+    cudaFree(const_cast<void*>(weights_.values));
+    weights_.values = nullptr;
+  }
+}
+
+const char* DeformableConvPlugin::getPluginType() const TRT_NOEXCEPT {
+  return "deformable_conv_plugin";
+}
+
+const char* DeformableConvPlugin::getPluginVersion() const TRT_NOEXCEPT {
+  return "1";
+}
+
+int DeformableConvPlugin::getNbOutputs() const TRT_NOEXCEPT { return 1; }
+
+nvinfer1::Dims DeformableConvPlugin::getOutputDimensions(
+    int index, const nvinfer1::Dims* inputs, int nb_input_dims) TRT_NOEXCEPT {
+  PADDLE_ENFORCE_EQ(nb_input_dims, 3,
+                    platform::errors::InvalidArgument(
+                        "The number of inputs should be equal to 3, but got %d",
+                        nb_input_dims));
+  nvinfer1::Dims ret;
+  ret.nbDims = inputs[0].nbDims;
+  ret.d[0] = kernel_dims_[0];
+  ret.d[1] = ConvOutputSize(inputs[0].d[1], kernel_dims_[2], dilations_[0],
+                            paddings_[0], strides_[0]);
+  ret.d[2] = ConvOutputSize(inputs[0].d[2], kernel_dims_[3], dilations_[1],
+                            paddings_[1], strides_[1]);
+  return ret;
+}
+
+bool DeformableConvPlugin::supportsFormat(
+    nvinfer1::DataType type, nvinfer1::TensorFormat format) const TRT_NOEXCEPT {
+  return ((type == data_type_ || type == nvinfer1::DataType::kINT32) &&
+          format == nvinfer1::TensorFormat::kLINEAR);
+}
+
+size_t DeformableConvPlugin::getWorkspaceSize(int max_batch_size) const
+    TRT_NOEXCEPT {
+  int c_i = input_dim_[0], h_i = input_dim_[1], w_i = input_dim_[2];
+  int k_h = kernel_dims_[2], k_w = kernel_dims_[3];
+  int c_o = output_dim_[0], h_o = output_dim_[1], w_o = output_dim_[2];
+  int num_bytes = (data_type_ == nvinfer1::DataType::kFLOAT ? 4 : 2);
+  size_t data_col_size = static_cast<size_t>(c_i * k_h * k_w * im2col_step_ *
+                                             h_o * w_o * num_bytes);
+  return data_col_size;
+}
+
+int DeformableConvPlugin::enqueue(int batch_size, const void* const* inputs,
+#if IS_TRT_VERSION_LT(8000)
+                                  void** outputs, void* workspace,
+#else
+                                  void* const* outputs, void* workspace,
+#endif
+                                  cudaStream_t stream) TRT_NOEXCEPT {
+  if (data_type_ == nvinfer1::DataType::kFLOAT) {
+    enqueue_impl<float>(batch_size, inputs, outputs, workspace, stream);
+  } else if (data_type_ == nvinfer1::DataType::kHALF) {
+#if CUDA_ARCH_FP16_SUPPORTED(__CUDA_ARCH__)
+    enqueue_impl<half>(batch_size, inputs, outputs, workspace, stream);
+#else
+    PADDLE_THROW(platform::errors::InvalidArgument(
+        "Current CUDA arch dose not support fp16. Please use fp32 instead."));
+#endif
+  } else {
+    PADDLE_THROW(platform::errors::InvalidArgument(
+        "The DeformableConv TRT Plugin's input type should be float or half."));
+  }
+  return cudaGetLastError() != cudaSuccess;
+}
+
+template <typename T>
+__device__ T kFloor(T x);
+
+template <>
+__device__ half kFloor<half>(half x) {
+  return hfloor(x);
+}
+
+template <>
+__device__ float kFloor<float>(float x) {
+  return floor(x);
+}
+
+template <typename T>
+__device__ T DmcnIm2colBilinear(const T* bottom_data, const int data_width,
+                                const int height, const int width, T h, T w) {
+  int h_low = kFloor<T>(h);
+  int w_low = kFloor<T>(w);
+  int h_high = h_low + 1;
+  int w_high = w_low + 1;
+
+  T h_low_t = h_low, w_low_t = w_low, one = 1.0f;
+  T lh = h - h_low_t;
+  T lw = w - w_low_t;
+  T hh = one - lh, hw = one - lw;
+
+  T v1 = 0;
+  if (h_low >= 0 && w_low >= 0) v1 = bottom_data[h_low * data_width + w_low];
+  T v2 = 0;
+  if (h_low >= 0 && w_high <= width - 1)
+    v2 = bottom_data[h_low * data_width + w_high];
+  T v3 = 0;
+  if (h_high <= height - 1 && w_low >= 0)
+    v3 = bottom_data[h_high * data_width + w_low];
+  T v4 = 0;
+  if (h_high <= height - 1 && w_high <= width - 1)
+    v4 = bottom_data[h_high * data_width + w_high];
+
+  T w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;
+
+  T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
+  return val;
+}
+
+template <typename T>
+__global__ void ModulatedDeformableIm2colGpuKernel(
+    const int nthreads, const T* data_im, const T* data_offset,
+    const T* data_mask, const int height, const int width, const int kernel_h,
+    const int kernel_w, const int pad_h, const int pad_w, const int stride_h,
+    const int stride_w, const int dilation_h, const int dilation_w,
+    const int channel_per_deformable_group, const int batch_size,
+    const int num_channels, const int deformable_group, const int height_col,
+    const int width_col, T* data_col) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  int offset = blockDim.x * gridDim.x;
+
+  T minus_one = -1.0f, height_t = height, width_t = width;
+  for (size_t i = index; i < nthreads; i += offset) {
+    const int w_col = i % width_col;
+    const int h_col = (i / width_col) % height_col;
+    const int b_col = (i / width_col) / height_col % batch_size;
+    const int c_im = (i / width_col / height_col) / batch_size;
+    const int c_col = c_im * kernel_h * kernel_w;
+
+    const int deformable_group_index = c_im / channel_per_deformable_group;
+
+    const int h_in = h_col * stride_h - pad_h;
+    const int w_in = w_col * stride_w - pad_w;
+
+    T* data_col_ptr =
+        data_col +
+        ((c_col * batch_size + b_col) * height_col + h_col) * width_col + w_col;
+    const T* data_im_ptr =
+        data_im + (b_col * num_channels + c_im) * height * width;
+    const T* data_offset_ptr =
+        data_offset +
+        (b_col * deformable_group + deformable_group_index) * 2 * kernel_h *
+            kernel_w * height_col * width_col;
+    const T* data_mask_ptr =
+        data_mask +
+        (b_col * deformable_group + deformable_group_index) * kernel_h *
+            kernel_w * height_col * width_col;
+
+    for (int i = 0; i < kernel_h; ++i) {
+      for (int j = 0; j < kernel_w; ++j) {
+        const int data_offset_h_ptr =
+            ((2 * (i * kernel_w + j)) * height_col + h_col) * width_col + w_col;
+        const int data_offset_w_ptr =
+            ((2 * (i * kernel_w + j) + 1) * height_col + h_col) * width_col +
+            w_col;
+        const int data_mask_hw_ptr =
+            ((i * kernel_w + j) * height_col + h_col) * width_col + w_col;
+
+        const T offset_h = data_offset_ptr[data_offset_h_ptr];
+        const T offset_w = data_offset_ptr[data_offset_w_ptr];
+        const T mask = data_mask_ptr[data_mask_hw_ptr];
+        T val = 0;
+        T h_im_t = h_in + i * dilation_h, w_im_t = w_in + j * dilation_w;
+        const T h_im = h_im_t + offset_h;
+        const T w_im = w_im_t + offset_w;
+        if (h_im > minus_one && w_im > minus_one && h_im < height_t &&
+            w_im < width_t) {
+          val = DmcnIm2colBilinear<T>(data_im_ptr, width, height, width, h_im,
+                                      w_im);
+        }
+        *data_col_ptr = val * mask;
+        data_col_ptr += batch_size * height_col * width_col;
+      }
+    }
+  }
+}
+
+template <typename T>
+void gemm_impl(cublasHandle_t handle, cublasOperation_t transa,
+               cublasOperation_t transb, int m, int n, int k, const T* alpha,
+               const T* A, int lda, const T* B, int ldb, const T* beta, T* C,
+               int ldc);
+
+template <>
+void gemm_impl<float>(cublasHandle_t handle, cublasOperation_t transa,
+                      cublasOperation_t transb, int m, int n, int k,
+                      const float* alpha, const float* A, int lda,
+                      const float* B, int ldb, const float* beta, float* C,
+                      int ldc) {
+  platform::dynload::cublasSgemm(handle, transa, transb, m, n, k, alpha, A, lda,
+                                 B, ldb, beta, C, ldc);
+}
+
+template <>
+void gemm_impl<half>(cublasHandle_t handle, cublasOperation_t transa,
+                     cublasOperation_t transb, int m, int n, int k,
+                     const half* alpha, const half* A, int lda, const half* B,
+                     int ldb, const half* beta, half* C, int ldc) {
+  platform::dynload::cublasHgemm(handle, transa, transb, m, n, k, alpha, A, lda,
+                                 B, ldb, beta, C, ldc);
+}
+
+template <typename T>
+int DeformableConvPlugin::enqueue_impl(int batch_size,
+                                       const void* const* inputs,
+                                       void* const* outputs, void* workspace,
+                                       cudaStream_t stream) {
+  const T* input = reinterpret_cast<const T*>(inputs[0]);
+  const T* offset = reinterpret_cast<const T*>(inputs[1]);
+  const T* mask = reinterpret_cast<const T*>(inputs[2]);
+  const T* filter = reinterpret_cast<const T*>(weights_.values);
+  T* output = reinterpret_cast<T*>(outputs[0]);
+
+  int c_i = input_dim_[0], h_i = input_dim_[1], w_i = input_dim_[2];
+  int k_h = kernel_dims_[2], k_w = kernel_dims_[3];
+  int c_o = output_dim_[0], h_o = output_dim_[1], w_o = output_dim_[2];
+
+  int input_stride = c_i * h_i * w_i;
+  int offset_stride = offset_dim_[0] * offset_dim_[1] * offset_dim_[2];
+  int mask_stride = mask_dim_[0] * mask_dim_[1] * mask_dim_[2];
+  int output_stride = c_o * h_o * w_o;
+
+  int M = c_o / groups_;
+  int N = im2col_step_ * h_o * w_o;
+  int K = c_i * k_h * k_w / groups_;
+
+  // c_i / deformable_groups
+  int channel_per_deformable_group = c_i / deformable_groups_;
+  // c_i * im2col_step * h_o * w_o
+  int num_kernels = c_i * im2col_step_ * h_o * w_o;
+
+  int blocks = NumBlocks(num_kernels);
+  int threads = kNumCUDAThreads;
+
+  T alpha = static_cast<T>(1.0f);
+  T beta = static_cast<T>(0.0f);
+
+  for (int i = 0; i < batch_size / im2col_step_; ++i) {
+    const T* data_im = input + i * im2col_step_ * input_stride;
+    const T* data_offset = offset + i * im2col_step_ * offset_stride;
+    const T* data_mask = mask + i * im2col_step_ * mask_stride;
+    T* data_col = reinterpret_cast<T*>(workspace);
+
+    ModulatedDeformableIm2colGpuKernel<T><<<blocks, threads, 0, stream>>>(
+        num_kernels, data_im, data_offset, data_mask, h_i, w_i, k_h, k_w,
+        paddings_[0], paddings_[1], strides_[0], strides_[1], dilations_[0],
+        dilations_[1], channel_per_deformable_group, im2col_step_, c_i,
+        deformable_groups_, h_o, w_o, data_col);
+
+    for (int g = 0; g < groups_; ++g) {
+      const T* weight = filter + g * M * K;
+      const T* col = data_col + g * K * N;
+      T* out = output + i * im2col_step_ * output_stride + g * M * N;
+      gemm_impl<T>(cublasHandle_, CUBLAS_OP_N, CUBLAS_OP_N, N, M, K, &alpha,
+                   col, N, weight, K, &beta, out, N);
+    }
+  }
+  return 0;
+}
+
+int DeformableConvPlugin::initialize() TRT_NOEXCEPT { return 0; }
+
+void DeformableConvPlugin::terminate() TRT_NOEXCEPT {}
+
+size_t DeformableConvPlugin::getSerializationSize() const TRT_NOEXCEPT {
+  size_t serialize_size = 0;
+  serialize_size += SerializedSize(data_type_);
+  serialize_size += SerializedSize(strides_);
+  serialize_size += SerializedSize(paddings_);
+  serialize_size += SerializedSize(dilations_);
+  serialize_size += SerializedSize(groups_);
+  serialize_size += SerializedSize(deformable_groups_);
+  serialize_size += SerializedSize(im2col_step_);
+  serialize_size += SerializedSize(kernel_dims_);
+  serialize_size += SerializedSize(weights_.count);
+  int num_bytes = (data_type_ == nvinfer1::DataType::kFLOAT ? 4 : 2);
+  serialize_size += weights_.count * num_bytes;
+  serialize_size += SerializedSize(input_dim_);
+  serialize_size += SerializedSize(offset_dim_);
+  serialize_size += SerializedSize(mask_dim_);
+  serialize_size += SerializedSize(output_dim_);
+  return serialize_size;
+}
+
+void DeformableConvPlugin::serialize(void* buffer) const TRT_NOEXCEPT {
+  SerializeValue(&buffer, data_type_);
+  SerializeValue(&buffer, strides_);
+  SerializeValue(&buffer, paddings_);
+  SerializeValue(&buffer, dilations_);
+  SerializeValue(&buffer, groups_);
+  SerializeValue(&buffer, deformable_groups_);
+  SerializeValue(&buffer, im2col_step_);
+  SerializeValue(&buffer, kernel_dims_);
+  SerializeValue(&buffer, weights_.count);
+  serializeFromDevice(&buffer, weights_);
+  SerializeValue(&buffer, input_dim_);
+  SerializeValue(&buffer, offset_dim_);
+  SerializeValue(&buffer, mask_dim_);
+  SerializeValue(&buffer, output_dim_);
+}
+
+void DeformableConvPlugin::destroy() TRT_NOEXCEPT {}
+
+void DeformableConvPlugin::setPluginNamespace(const char* lib_namespace)
+    TRT_NOEXCEPT {
+  namespace_ = std::string(lib_namespace);
+}
+
+const char* DeformableConvPlugin::getPluginNamespace() const TRT_NOEXCEPT {
+  return namespace_.c_str();
+}
+
+nvinfer1::DataType DeformableConvPlugin::getOutputDataType(
+    int index, const nvinfer1::DataType* input_type,
+    int nb_inputs) const TRT_NOEXCEPT {
+  return data_type_;
+}
+
+bool DeformableConvPlugin::isOutputBroadcastAcrossBatch(
+    int output_index, const bool* input_is_broadcast,
+    int nb_inputs) const TRT_NOEXCEPT {
+  return false;
+}
+
+bool DeformableConvPlugin::canBroadcastInputAcrossBatch(int input_index) const
+    TRT_NOEXCEPT {
+  return false;
+}
+
+void DeformableConvPlugin::attachToContext(
+    cudnnContext* cudnnContext, cublasContext* cublasContext,
+    nvinfer1::IGpuAllocator* gpuAllocator) TRT_NOEXCEPT {
+  cublasHandle_ = cublasContext;
+}
+
+void DeformableConvPlugin::configurePlugin(
+    const nvinfer1::Dims* input_dims, int nb_inputs,
+    const nvinfer1::Dims* output_dims, int nb_outputs,
+    const nvinfer1::DataType* input_types,
+    const nvinfer1::DataType* output_types, const bool* input_is_broadcast,
+    const bool* output_is_broadcast, nvinfer1::PluginFormat float_format,
+    int max_batct_size) TRT_NOEXCEPT {
+  PADDLE_ENFORCE_EQ(
+      nb_inputs, 3,
+      platform::errors::InvalidArgument(
+          "The number of inputs should be equal to 3, but got %d", nb_inputs));
+  PADDLE_ENFORCE_EQ(
+      nb_outputs, 1,
+      platform::errors::InvalidArgument(
+          "The number of inputs should be equal to 1, but got %d", nb_outputs));
+
+  for (int i = 0; i < input_dims[0].nbDims; i++) {
+    input_dim_.push_back(input_dims[0].d[i]);
+  }
+  for (int i = 0; i < input_dims[1].nbDims; i++) {
+    offset_dim_.push_back(input_dims[1].d[i]);
+  }
+  for (int i = 0; i < input_dims[2].nbDims; i++) {
+    mask_dim_.push_back(input_dims[2].d[i]);
+  }
+  for (int i = 0; i < output_dims[0].nbDims; i++) {
+    output_dim_.push_back(output_dims[0].d[i]);
+  }
+}
+
+nvinfer1::IPluginV2Ext* DeformableConvPlugin::clone() const TRT_NOEXCEPT {
+  return new DeformableConvPlugin(data_type_, weights_, kernel_dims_, strides_,
+                                  paddings_, dilations_, groups_,
+                                  deformable_groups_, im2col_step_, input_dim_,
+                                  offset_dim_, mask_dim_, output_dim_);
+}
+
+void DeformableConvPluginCreator::setPluginNamespace(const char* lib_namespace)
+    TRT_NOEXCEPT {
+  namespace_ = std::string(lib_namespace);
+}
+
+const char* DeformableConvPluginCreator::getPluginNamespace() const
+    TRT_NOEXCEPT {
+  return namespace_.c_str();
+}
+
+const char* DeformableConvPluginCreator::getPluginName() const TRT_NOEXCEPT {
+  return "deformable_conv_plugin";
+}
+
+const char* DeformableConvPluginCreator::getPluginVersion() const TRT_NOEXCEPT {
+  return "1";
+}
+
+const nvinfer1::PluginFieldCollection*
+DeformableConvPluginCreator::getFieldNames() TRT_NOEXCEPT {
+  return &field_collection_;
+}
+
+nvinfer1::IPluginV2Ext* DeformableConvPluginCreator::createPlugin(
+    const char* name, const nvinfer1::PluginFieldCollection* fc) TRT_NOEXCEPT {
+  const nvinfer1::PluginField* fields = fc->fields;
+
+  nvinfer1::DataType data_type;
+  std::vector<int> strides, paddings, dilations, kernel_dims;
+  nvinfer1::Weights weights;
+  int groups = -1;
+  int deformable_groups = -1;
+  int im2col_step = -1;
+
+  for (int i = 0; i < fc->nbFields; ++i) {
+    const std::string field_name(fc->fields[i].name);
+    if (field_name.compare("data_type") == 0) {
+      data_type = *static_cast<const nvinfer1::DataType*>(fc->fields[i].data);
+    } else if (field_name.compare("strides")) {
+      const int length = fc->fields[i].length;
+      const int* data = static_cast<const int*>(fc->fields[i].data);
+      strides.insert(strides.end(), data, data + length);
+    } else if (field_name.compare("paddings")) {
+      const int length = fc->fields[i].length;
+      const int* data = static_cast<const int*>(fc->fields[i].data);
+      paddings.insert(paddings.end(), data, data + length);
+    } else if (field_name.compare("dilations")) {
+      const int length = fc->fields[i].length;
+      const int* data = static_cast<const int*>(fc->fields[i].data);
+      dilations.insert(dilations.end(), data, data + length);
+    } else if (field_name.compare("groups")) {
+      groups = *static_cast<const int*>(fc->fields[i].data);
+    } else if (field_name.compare("deformable_groups")) {
+      deformable_groups = *static_cast<const int*>(fc->fields[i].data);
+    } else if (field_name.compare("im2col_step")) {
+      im2col_step = *static_cast<const int*>(fc->fields[i].data);
+    } else if (field_name.compare("kernel_dims")) {
+      const int length = fc->fields[i].length;
+      const int* data = static_cast<const int*>(fc->fields[i].data);
+      kernel_dims.insert(kernel_dims.end(), data, data + length);
+    } else if (field_name.compare("weights")) {
+      weights.count = fc->fields[i].length;
+      weights.values = fc->fields[i].data;
+    } else {
+      PADDLE_THROW(platform::errors::InvalidArgument(
+          "Unknown plugin field name [%s] in the DeformableConv TRT Plugin.",
+          field_name));
+    }
+  }
+  weights.type = data_type;
+  return new DeformableConvPlugin(data_type, weights, kernel_dims, strides,
+                                  paddings, dilations, groups,
+                                  deformable_groups, im2col_step);
+}
+
+nvinfer1::IPluginV2Ext* DeformableConvPluginCreator::deserializePlugin(
+    const char* name, const void* serial_data,
+    size_t serial_length) TRT_NOEXCEPT {
+  auto plugin = new DeformableConvPlugin(serial_data, serial_length);
+  plugin->setPluginNamespace(namespace_.c_str());
+  return plugin;
+}
+
+}  // namespace plugin
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/tensorrt/plugin/deformable_conv_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/deformable_conv_op_plugin.h
new file mode 100644
index 0000000000000..8ba19288ce564
--- /dev/null
+++ b/paddle/fluid/inference/tensorrt/plugin/deformable_conv_op_plugin.h
@@ -0,0 +1,148 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <cstdio>
+#include <string>
+#include <vector>
+
+#include "paddle/fluid/inference/tensorrt/engine.h"
+#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h"
+#include "paddle/fluid/platform/dynload/cublas.h"
+#include "paddle/fluid/platform/enforce.h"
+
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+namespace plugin {
+
+class DeformableConvPlugin : public nvinfer1::IPluginV2Ext {
+ public:
+  explicit DeformableConvPlugin(
+      const nvinfer1::DataType data_type, const nvinfer1::Weights& weights,
+      const std::vector<int>& kernel_dims, const std::vector<int>& strides,
+      const std::vector<int>& paddings, const std::vector<int>& dilations,
+      const int groups, const int deformable_groups, const int im2col_step);
+  explicit DeformableConvPlugin(
+      const nvinfer1::DataType data_type, const nvinfer1::Weights& weights,
+      const std::vector<int>& kernel_dims, const std::vector<int>& strides,
+      const std::vector<int>& paddings, const std::vector<int>& dilations,
+      const int groups, const int deformable_groups, const int im2col_step,
+      const std::vector<int>& input_dim, const std::vector<int>& offset_dim,
+      const std::vector<int>& mask_dim, const std::vector<int>& output_dim);
+  DeformableConvPlugin(const void* data, size_t length);
+  ~DeformableConvPlugin() override;
+
+  const char* getPluginType() const TRT_NOEXCEPT override;
+  const char* getPluginVersion() const TRT_NOEXCEPT override;
+  int getNbOutputs() const TRT_NOEXCEPT override;
+  nvinfer1::Dims getOutputDimensions(int index, const nvinfer1::Dims* inputs,
+                                     int nb_input_dims) TRT_NOEXCEPT override;
+  bool supportsFormat(nvinfer1::DataType type, nvinfer1::TensorFormat format)
+      const TRT_NOEXCEPT override;
+  size_t getWorkspaceSize(int max_batch_size) const TRT_NOEXCEPT override;
+#if IS_TRT_VERSION_LT(8000)
+  int enqueue(int batch_size, const void* const* inputs, void** outputs,
+#else
+  int enqueue(int batch_size, const void* const* inputs, void* const* outputs,
+#endif
+              void* workspace, cudaStream_t stream) TRT_NOEXCEPT override;
+  int initialize() TRT_NOEXCEPT override;
+  void terminate() TRT_NOEXCEPT override;
+  size_t getSerializationSize() const TRT_NOEXCEPT override;
+  void serialize(void* buffer) const TRT_NOEXCEPT override;
+  void destroy() TRT_NOEXCEPT override;
+  void setPluginNamespace(const char* lib_namespace) TRT_NOEXCEPT override;
+  const char* getPluginNamespace() const TRT_NOEXCEPT override;
+  nvinfer1::DataType getOutputDataType(
+      int index, const nvinfer1::DataType* input_type,
+      int nb_inputs) const TRT_NOEXCEPT override;
+  bool isOutputBroadcastAcrossBatch(int output_index,
+                                    const bool* input_is_broadcast,
+                                    int nb_inputs) const TRT_NOEXCEPT override;
+  bool canBroadcastInputAcrossBatch(int input_index) const
+      TRT_NOEXCEPT override;
+
+  void attachToContext(cudnnContext* cudnnContext, cublasContext* cublasContext,
+                       nvinfer1::IGpuAllocator* gpuAllocator)
+      TRT_NOEXCEPT override;
+
+  void configurePlugin(const nvinfer1::Dims* input_dims, int nb_inputs,
+                       const nvinfer1::Dims* output_dims, int nb_outputs,
+                       const nvinfer1::DataType* input_types,
+                       const nvinfer1::DataType* output_types,
+                       const bool* input_is_broadcast,
+                       const bool* output_is_broadcast,
+                       nvinfer1::PluginFormat float_format,
+                       int max_batct_size) TRT_NOEXCEPT override;
+  nvinfer1::IPluginV2Ext* clone() const TRT_NOEXCEPT override;
+
+ private:
+  template <typename T>
+  int enqueue_impl(int batch_size, const void* const* inputs,
+                   void* const* outputs, void* workspace, cudaStream_t stream);
+  nvinfer1::Weights copyToDevice(const void* hostData, size_t count);
+  void serializeFromDevice(void** hostBuffer,
+                           const nvinfer1::Weights& deviceWeights) const;
+  nvinfer1::Weights deserializeToDevice(const void** hostBuffer, size_t count);
+
+  nvinfer1::DataType data_type_;
+  nvinfer1::Weights weights_;
+  std::vector<int> kernel_dims_;
+  std::vector<int> strides_;
+  std::vector<int> paddings_;
+  std::vector<int> dilations_;
+  int groups_;
+  int deformable_groups_;
+  int im2col_step_;
+  std::string namespace_;
+
+  std::vector<int> input_dim_;
+  std::vector<int> offset_dim_;
+  std::vector<int> mask_dim_;
+  std::vector<int> output_dim_;
+
+  cublasHandle_t cublasHandle_;
+};
+
+class DeformableConvPluginCreator : public nvinfer1::IPluginCreator {
+ public:
+  DeformableConvPluginCreator() = default;
+  ~DeformableConvPluginCreator() override = default;
+
+  void setPluginNamespace(const char* lib_namespace) TRT_NOEXCEPT override;
+  const char* getPluginNamespace() const TRT_NOEXCEPT override;
+  const char* getPluginName() const TRT_NOEXCEPT override;
+  const char* getPluginVersion() const TRT_NOEXCEPT override;
+  const nvinfer1::PluginFieldCollection* getFieldNames() TRT_NOEXCEPT override;
+
+  nvinfer1::IPluginV2Ext* createPlugin(
+      const char* name,
+      const nvinfer1::PluginFieldCollection* fc) TRT_NOEXCEPT override;
+  nvinfer1::IPluginV2Ext* deserializePlugin(
+      const char* name, const void* serial_data,
+      size_t serial_length) TRT_NOEXCEPT override;
+
+ private:
+  std::string namespace_;
+  nvinfer1::PluginFieldCollection field_collection_;
+};
+
+REGISTER_TRT_PLUGIN_V2(DeformableConvPluginCreator);
+
+}  // namespace plugin
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/tests/api/analyzer_int8_image_classification_tester.cc b/paddle/fluid/inference/tests/api/analyzer_int8_image_classification_tester.cc
index 6bfa8a821ae8c..8f8b73044232a 100644
--- a/paddle/fluid/inference/tests/api/analyzer_int8_image_classification_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_int8_image_classification_tester.cc
@@ -47,6 +47,10 @@ TEST(Analyzer_int8_image_classification, quantization) {
     std::shared_ptr<std::vector<PaddleTensor>> warmup_data =
         paddle::inference::GetWarmupData(input_slots_all);
 
+    // INT8 implies FC oneDNN passes to be used
+    q_cfg.pass_builder()->AppendPass("fc_mkldnn_pass");
+    q_cfg.pass_builder()->AppendPass("fc_act_mkldnn_fuse_pass");
+
     // configure quantizer
     q_cfg.EnableMkldnnQuantizer();
     q_cfg.mkldnn_quantizer_config()->SetWarmupData(warmup_data);
diff --git a/paddle/fluid/inference/tests/api/tester_helper.h b/paddle/fluid/inference/tests/api/tester_helper.h
index 8e6b8b197d7f2..01953bd721f3e 100644
--- a/paddle/fluid/inference/tests/api/tester_helper.h
+++ b/paddle/fluid/inference/tests/api/tester_helper.h
@@ -465,7 +465,8 @@ void PredictionWarmUp(PaddlePredictor *predictor,
   }
   int iterations = 1;
   if (FLAGS_warmup_iters > 1)
-    iterations = std::min(FLAGS_warmup_iters, static_cast<int>(inputs.size()));
+    iterations =
+        (std::min)(FLAGS_warmup_iters, static_cast<int>(inputs.size()));
   outputs->resize(iterations);
   Timer warmup_timer;
   double elapsed_time = 0;
diff --git a/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_fp16_serialize_deserialize_test.cc b/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_fp16_serialize_deserialize_test.cc
index a5b9e6825c8d4..7e9f71c8b3c0c 100644
--- a/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_fp16_serialize_deserialize_test.cc
+++ b/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_fp16_serialize_deserialize_test.cc
@@ -13,9 +13,15 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <dirent.h>
+#ifndef _WIN32
+#include <unistd.h>
+#else  // headers below are substitute of unistd.h in windows
+#include <io.h>
+#include <process.h>
+#endif
+#define GLOG_NO_ABBREVIATED_SEVERITIES
 #include <glog/logging.h>
 #include <gtest/gtest.h>
-#include <unistd.h>
 #include "gflags/gflags.h"
 
 #include "paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_serialize_deserialize_test.h"
diff --git a/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_serialize_deserialize_test.cc b/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_serialize_deserialize_test.cc
index 084169da3403d..209dd90c48070 100644
--- a/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_serialize_deserialize_test.cc
+++ b/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_serialize_deserialize_test.cc
@@ -13,9 +13,15 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <dirent.h>
+#ifndef _WIN32
+#include <unistd.h>
+#else  // headers below are substitute of unistd.h in windows
+#include <io.h>
+#include <process.h>
+#endif
+#define GLOG_NO_ABBREVIATED_SEVERITIES
 #include <glog/logging.h>
 #include <gtest/gtest.h>
-#include <unistd.h>
 #include "gflags/gflags.h"
 
 #include "paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_serialize_deserialize_test.h"
diff --git a/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_serialize_deserialize_test.h b/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_serialize_deserialize_test.h
index 86a5223cafe3c..5ae14576dfeb0 100644
--- a/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_serialize_deserialize_test.h
+++ b/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_serialize_deserialize_test.h
@@ -14,7 +14,12 @@ limitations under the License. */
 #pragma once
 #include <glog/logging.h>
 #include <gtest/gtest.h>
+#ifndef _WIN32
 #include <unistd.h>
+#else  // headers below are substitute of unistd.h in windows
+#include <io.h>
+#include <process.h>
+#endif
 #include <functional>
 #include <map>
 #include <string>
diff --git a/paddle/fluid/inference/tests/infer_ut/CMakeLists.txt b/paddle/fluid/inference/tests/infer_ut/CMakeLists.txt
index 9d590509a1eb6..fcc76538b9b03 100644
--- a/paddle/fluid/inference/tests/infer_ut/CMakeLists.txt
+++ b/paddle/fluid/inference/tests/infer_ut/CMakeLists.txt
@@ -194,7 +194,7 @@ if(WITH_GPU)
     if(USE_TENSORRT)
       set(DEPS ${DEPS} ${TENSORRT_LIB_DIR}/nvinfer${CMAKE_STATIC_LIBRARY_SUFFIX})
       set(DEPS ${DEPS} ${TENSORRT_LIB_DIR}/nvinfer_plugin${CMAKE_STATIC_LIBRARY_SUFFIX})
-      if(${TENSORRT_MAJOR_VERSION} GREATER_EQUAL 7)
+      if(${TENSORRT_MAJOR_VERSION} EQUAL 7)
         set(DEPS ${DEPS} ${TENSORRT_LIB_DIR}/myelin64_1${CMAKE_STATIC_LIBRARY_SUFFIX})
       endif()
     endif()
@@ -229,7 +229,7 @@ if(WIN32)
             COMMAND ${CMAKE_COMMAND} -E copy ${TENSORRT_LIB_DIR}/nvinfer_plugin${CMAKE_SHARED_LIBRARY_SUFFIX}
               ${CMAKE_BINARY_DIR}/${CMAKE_BUILD_TYPE}
     )
-    if(${TENSORRT_MAJOR_VERSION} GREATER_EQUAL 7)
+    if(${TENSORRT_MAJOR_VERSION} EQUAL 7)
       add_custom_command(TARGET ${DEMO_NAME} POST_BUILD
               COMMAND ${CMAKE_COMMAND} -E copy ${TENSORRT_LIB_DIR}/myelin64_1${CMAKE_SHARED_LIBRARY_SUFFIX}
                 ${CMAKE_BINARY_DIR}/${CMAKE_BUILD_TYPE})
diff --git a/paddle/fluid/inference/tests/infer_ut/test_ppyolov2_r50vd.cc b/paddle/fluid/inference/tests/infer_ut/test_ppyolov2_r50vd.cc
index 9689ec20956a1..67b0c5ca17c2f 100644
--- a/paddle/fluid/inference/tests/infer_ut/test_ppyolov2_r50vd.cc
+++ b/paddle/fluid/inference/tests/infer_ut/test_ppyolov2_r50vd.cc
@@ -73,7 +73,7 @@ TEST(tensorrt_tester_ppyolov2_r50vd, multi_thread2_trt_fp32_bz1) {
                   FLAGS_modeldir + "/model.pdiparams");
   config.EnableUseGpu(100, 0);
   config.EnableTensorRtEngine(
-      1 << 20, 2, 10, paddle_infer::PrecisionType::kFloat32, false, false);
+      1 << 28, 2, 10, paddle_infer::PrecisionType::kFloat32, false, false);
   LOG(INFO) << config.Summary();
   // get groudtruth by disbale ir
   paddle_infer::services::PredictorPool pred_pool_no_ir(config_no_ir, 1);
diff --git a/paddle/fluid/memory/allocation/CMakeLists.txt b/paddle/fluid/memory/allocation/CMakeLists.txt
index 4aa1900f53f5e..58979d6c3e185 100644
--- a/paddle/fluid/memory/allocation/CMakeLists.txt
+++ b/paddle/fluid/memory/allocation/CMakeLists.txt
@@ -18,6 +18,9 @@ if (WITH_GPU)
   nv_library(thread_local_allocator SRCS thread_local_allocator.cc DEPS allocator)
   nv_library(pinned_allocator SRCS pinned_allocator.cc DEPS allocator)
   cc_test(thread_local_allocator_test SRCS thread_local_allocator_test.cc DEPS thread_local_allocator)
+  if(CUDA_VERSION GREATER_EQUAL 10.2)
+    nv_library(cuda_virtual_mem_allocator SRCS cuda_virtual_mem_allocator.cc DEPS dynload_cuda)
+  endif()
 endif()
 
 if (WITH_ROCM)
@@ -36,6 +39,9 @@ cc_library(retry_allocator SRCS retry_allocator.cc DEPS allocator)
 
 if (WITH_GPU OR WITH_ROCM)
     set(AllocatorFacadeDeps gpu_info cuda_allocator pinned_allocator cuda_device_guard thread_local_allocator)
+    if(CUDA_VERSION GREATER_EQUAL 10.2)
+      list(APPEND AllocatorFacadeDeps cuda_virtual_mem_allocator)
+    endif()
 elseif(WITH_XPU)
     set(AllocatorFacadeDeps xpu_info)
 elseif(WITH_ASCEND)
@@ -72,7 +78,7 @@ else()
                 cpu_allocator)
 endif()
 
-list(APPEND AllocatorFacadeDeps cpu_allocator locked_allocator aligned_allocator retry_allocator buffered_allocator naive_best_fit_allocator auto_growth_best_fit_allocator best_fit_allocator)
+list(APPEND AllocatorFacadeDeps cpu_allocator locked_allocator aligned_allocator retry_allocator buffered_allocator naive_best_fit_allocator auto_growth_best_fit_allocator virtual_memory_auto_growth_best_fit_allocator best_fit_allocator)
 
 if (WITH_ASCEND_CL)
     list(APPEND AllocatorFacadeDeps npu_pinned_allocator)
@@ -107,6 +113,8 @@ cc_library(auto_growth_best_fit_allocator SRCS auto_growth_best_fit_allocator.cc
 cc_test(auto_growth_best_fit_allocator_facade_test SRCS auto_growth_best_fit_allocator_facade_test.cc DEPS cpu_allocator auto_growth_best_fit_allocator)
 cc_test(auto_growth_best_fit_allocator_test SRCS auto_growth_best_fit_allocator_test.cc DEPS auto_growth_best_fit_allocator)
 
+cc_library(virtual_memory_auto_growth_best_fit_allocator SRCS virtual_memory_auto_growth_best_fit_allocator.cc DEPS allocator aligned_allocator)
+
 if(NOT WIN32)
   cc_library(mmap_allocator SRCS mmap_allocator.cc DEPS allocator)
   cc_test(mmap_allocator_test SRCS mmap_allocator_test.cc DEPS mmap_allocator allocator)
diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc
index 281902f3a2b12..9da735636fc00 100644
--- a/paddle/fluid/memory/allocation/allocator_facade.cc
+++ b/paddle/fluid/memory/allocation/allocator_facade.cc
@@ -33,6 +33,11 @@
 #include "paddle/fluid/memory/allocation/thread_local_allocator.h"
 #include "paddle/fluid/platform/gpu_info.h"
 #endif
+#if CUDA_VERSION >= 10020
+#include "paddle/fluid/memory/allocation/cuda_virtual_mem_allocator.h"
+#include "paddle/fluid/memory/allocation/virtual_memory_auto_growth_best_fit_allocator.h"
+#include "paddle/fluid/platform/dynload/cuda_driver.h"
+#endif
 #ifdef PADDLE_WITH_CUDA
 #include "paddle/fluid/platform/cuda_graph.h"
 #endif
@@ -51,6 +56,9 @@ PADDLE_DEFINE_EXPORTED_bool(
     "Whether to use system allocator to allocate CPU and GPU memory. "
     "Only used for unittests.");
 
+PADDLE_DEFINE_EXPORTED_bool(use_virtual_memory_auto_growth, false,
+                            "Use VirtualMemoryAutoGrowthBestFitAllocator.");
+
 DECLARE_string(allocator_strategy);
 
 namespace paddle {
@@ -202,7 +210,7 @@ class AllocatorFacadePrivate {
 
   inline const std::shared_ptr<Allocator>& GetAllocator(
       const platform::Place& place, size_t size) {
-    VLOG(4) << "GetAllocator"
+    VLOG(6) << "GetAllocator"
             << " " << place << " " << size;
     const auto& allocators =
         (size > 0 ? (UNLIKELY(FLAGS_use_system_allocator) ? system_allocators_
@@ -258,6 +266,40 @@ class AllocatorFacadePrivate {
 
   void InitAutoGrowthCUDAAllocator(platform::CUDAPlace p,
                                    bool allow_free_idle_chunk) {
+#if defined(PADDLE_WITH_HIP)
+    auto cuda_allocator = std::make_shared<CUDAAllocator>(p);
+    allocators_[p] = std::make_shared<AutoGrowthBestFitAllocator>(
+        cuda_allocator, platform::GpuMinChunkSize(), allow_free_idle_chunk);
+#endif
+
+#if defined(PADDLE_WITH_CUDA)
+#if CUDA_VERSION >= 10020
+    CUdevice device;
+    int val;
+    try {
+      PADDLE_ENFORCE_CUDA_SUCCESS(
+          paddle::platform::dynload::cuDeviceGet(&device, p.GetDeviceId()));
+
+      PADDLE_ENFORCE_CUDA_SUCCESS(
+          paddle::platform::dynload::cuDeviceGetAttribute(
+              &val, CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED,
+              device));
+    } catch (...) {
+      val = 0;
+    }
+
+    if (val > 0 && FLAGS_use_virtual_memory_auto_growth) {
+      auto cuda_allocator = std::make_shared<CUDAVirtualMemAllocator>(p);
+      allocators_[p] =
+          std::make_shared<VirtualMemoryAutoGrowthBestFitAllocator>(
+              cuda_allocator, platform::GpuMinChunkSize(), p);
+    } else {
+      auto cuda_allocator = std::make_shared<CUDAAllocator>(p);
+      allocators_[p] = std::make_shared<AutoGrowthBestFitAllocator>(
+          cuda_allocator, platform::GpuMinChunkSize(), allow_free_idle_chunk);
+    }
+
+#else
     auto cuda_allocator = std::make_shared<CUDAAllocator>(p);
     auto alignment = platform::GpuMinChunkSize();
     bool need_addr_align = true;
@@ -292,6 +334,8 @@ class AllocatorFacadePrivate {
     }
     allocators_[p] = std::make_shared<AutoGrowthBestFitAllocator>(
         underlying_allocator, alignment, 0, allow_free_idle_chunk);
+#endif
+#endif
   }
 #endif
 
diff --git a/paddle/fluid/memory/allocation/cuda_virtual_mem_allocator.cc b/paddle/fluid/memory/allocation/cuda_virtual_mem_allocator.cc
new file mode 100644
index 0000000000000..ef64c3bdb355e
--- /dev/null
+++ b/paddle/fluid/memory/allocation/cuda_virtual_mem_allocator.cc
@@ -0,0 +1,225 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifdef PADDLE_WITH_CUDA
+#include <cuda.h>
+#include <cuda_runtime.h>
+#endif
+
+#include <string>
+#include "paddle/fluid/memory/allocation/cuda_virtual_mem_allocator.h"
+#include "paddle/fluid/platform/enforce.h"
+
+#ifdef PADDLE_WITH_CUDA
+#include "paddle/fluid/platform/cuda_device_guard.h"
+#include "paddle/fluid/platform/dynload/cuda_driver.h"
+#include "paddle/fluid/platform/gpu_info.h"
+#endif
+#if CUDA_VERSION >= 10020
+
+namespace paddle {
+namespace memory {
+namespace allocation {
+
+CUDAVirtualMemAllocator::CUDAVirtualMemAllocator(
+    const platform::CUDAPlace& place)
+    : place_(place) {
+  CUmemAllocationProp prop = {};
+
+  // Setup the properties common for all the chunks
+  // The allocations will be device pinned memory.
+  // This property structure describes the physical location where the memory
+  // will be allocated via cuMemCreate allong with additional properties In this
+  // case, the allocation will be pinnded device memory local to a given device.
+  prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
+  prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
+  prop.location.id = place.device;
+  prop_ = prop;
+
+  // Prepare the access descriptor array indicating where and how the backings
+  // should be visible.
+  access_desc_.resize(platform::GetCUDADeviceCount());
+  for (int dev_id = 0; dev_id < platform::GetCUDADeviceCount(); ++dev_id) {
+    if (place.device != dev_id) {
+      int capable = 0;
+      PADDLE_ENFORCE_CUDA_SUCCESS(
+          cudaDeviceCanAccessPeer(&capable, place.device, dev_id));
+      if (!capable) {
+        continue;
+      }
+    }
+    // Specify which device we are adding mappings for.
+    access_desc_[dev_id].location.type = CU_MEM_LOCATION_TYPE_DEVICE;
+    access_desc_[dev_id].location.id = dev_id;
+
+    // Specify both read and write access.
+    access_desc_[dev_id].flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
+  }
+
+  // Get the minimum granularity needed for all devices
+  // (the max of the minimum granularity of each participating device)
+  granularity_ = 0;
+  for (int dev_id = 0; dev_id < platform::GetCUDADeviceCount(); ++dev_id) {
+    size_t granularity;
+    prop.location.id = dev_id;
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        paddle::platform::dynload::cuMemGetAllocationGranularity(
+            &granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM));
+    granularity_ = std::max(granularity, granularity_);
+  }
+
+  size_t actual_avail, actual_total;
+  paddle::platform::CUDADeviceGuard guard(place.device);
+  PADDLE_ENFORCE_CUDA_SUCCESS(cudaMemGetInfo(&actual_avail, &actual_total));
+
+  virtual_mem_size_ = AlignedSize(actual_total, granularity_);
+
+  // Reserve the required contiguous virtual address space for the allocations
+  // The maximum video memory size we can apply for is the video memory size of
+  // GPU,
+  // so the virtual address space size we reserve is equal to the GPU video
+  // memory size
+  PADDLE_ENFORCE_CUDA_SUCCESS(paddle::platform::dynload::cuMemAddressReserve(
+      &virtual_mem_base_, virtual_mem_size_, 0, 0, 0));
+
+  virtual_mem_alloced_offset_ = 0;
+}
+
+bool CUDAVirtualMemAllocator::IsAllocThreadSafe() const { return false; }
+
+void CUDAVirtualMemAllocator::FreeImpl(Allocation* allocation) {
+  PADDLE_ENFORCE_EQ(
+      BOOST_GET_CONST(platform::CUDAPlace, allocation->place()), place_,
+      platform::errors::PermissionDenied(
+          "GPU memory is freed in incorrect device. This may be a bug"));
+
+  auto iter = virtual_2_physical_map_.find(
+      reinterpret_cast<CUdeviceptr>(allocation->ptr()));
+  if (iter == virtual_2_physical_map_.end()) {
+    PADDLE_THROW(platform::errors::InvalidArgument(
+        "Can not find virtual memory address at %s", allocation->ptr()));
+  }
+
+  int prev_id;
+  cudaGetDevice(&prev_id);
+  if (prev_id != place_.device) {
+    cudaSetDevice(place_.device);
+  }
+
+  auto result =
+      paddle::platform::dynload::cuMemUnmap(iter->first, iter->second.second);
+  if (result != CUDA_ERROR_DEINITIALIZED) {
+    PADDLE_ENFORCE_CUDA_SUCCESS(result);
+  }
+
+  if (result != CUDA_ERROR_DEINITIALIZED) {
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::RecordedCuMemRelease(
+        iter->second.first, iter->second.second, place_.device));
+  }
+
+  if (prev_id != place_.device) {
+    cudaSetDevice(prev_id);
+  }
+
+  virtual_2_physical_map_.erase(iter);
+
+  delete allocation;
+}
+
+Allocation* CUDAVirtualMemAllocator::AllocateImpl(size_t size) {
+  size = AlignedSize(size, granularity_);
+
+  CUdeviceptr ptr = virtual_mem_base_ + virtual_mem_alloced_offset_;
+
+  if (ptr + size > virtual_mem_base_ + virtual_mem_size_) {
+    PADDLE_THROW_BAD_ALLOC(platform::errors::ResourceExhausted(
+        "\n\nOut of memory error on GPU Virtual Memory %d. "
+        "Cannot allocate %s memory on GPU Virtual Memory %d, %s memory has "
+        "been allocated and "
+        "available memory is only %s.\n\n"
+        "Please decrease the batch size of your model.\n\n",
+        place_.device, string::HumanReadableSize(size), place_.device,
+        string::HumanReadableSize(virtual_mem_alloced_offset_),
+        string::HumanReadableSize(virtual_mem_size_ -
+                                  virtual_mem_alloced_offset_),
+        place_.device));
+    return nullptr;
+  }
+
+  CUmemGenericAllocationHandle handle;
+
+  paddle::platform::CUDADeviceGuard guard(place_.device);
+
+  // Create physical memory backing allocation.
+  auto result =
+      platform::RecordedCuMemCreate(&handle, size, &prop_, 0, place_.device);
+
+  if (result != CUDA_SUCCESS) {
+    if (result == CUDA_ERROR_OUT_OF_MEMORY) {
+      size_t actual_avail, actual_total;
+      PADDLE_ENFORCE_CUDA_SUCCESS(cudaMemGetInfo(&actual_avail, &actual_total));
+      size_t actual_allocated = actual_total - actual_avail;
+
+      PADDLE_THROW_BAD_ALLOC(platform::errors::ResourceExhausted(
+          "\n\nOut of memory error on GPU %d. "
+          "Cannot allocate %s memory on GPU %d, %s memory has been allocated "
+          "and "
+          "available memory is only %s.\n\n"
+          "Please check whether there is any other process using GPU %d.\n"
+          "1. If yes, please stop them, or start PaddlePaddle on another GPU.\n"
+          "2. If no, please decrease the batch size of your model.\n\n",
+          place_.device, string::HumanReadableSize(size), place_.device,
+          string::HumanReadableSize(actual_allocated),
+          string::HumanReadableSize(actual_avail), place_.device));
+    } else {
+      PADDLE_ENFORCE_CUDA_SUCCESS(result);
+    }
+    return nullptr;
+  }
+
+  // Assign the chunk to the appropriate VA range and release the handle.
+  // After mapping the memory, it can be referenced by virtual address.
+  // The allocation will be kept live until it is unmapped.
+  result = paddle::platform::dynload::cuMemMap(ptr, size, 0, handle, 0);
+
+  if (result != CUDA_SUCCESS) {
+    platform::RecordedCuMemRelease(handle, size, place_.device);
+    PADDLE_ENFORCE_CUDA_SUCCESS(result);
+    return nullptr;
+  }
+
+  // Apply the access descriptors to the whole VA range.
+  result = paddle::platform::dynload::cuMemSetAccess(
+      ptr, size, access_desc_.data(), access_desc_.size());
+
+  if (result != CUDA_SUCCESS) {
+    paddle::platform::dynload::cuMemUnmap(ptr, size);
+    platform::RecordedCuMemRelease(handle, size, place_.device);
+    PADDLE_ENFORCE_CUDA_SUCCESS(result);
+    return nullptr;
+  }
+
+  virtual_2_physical_map_.emplace(ptr, std::make_pair(handle, size));
+
+  virtual_mem_alloced_offset_ += size;
+
+  return new Allocation(reinterpret_cast<void*>(ptr), size,
+                        platform::Place(place_));
+}
+
+}  // namespace allocation
+}  // namespace memory
+}  // namespace paddle
+
+#endif
diff --git a/paddle/fluid/memory/allocation/cuda_virtual_mem_allocator.h b/paddle/fluid/memory/allocation/cuda_virtual_mem_allocator.h
new file mode 100644
index 0000000000000..c51b56566bb02
--- /dev/null
+++ b/paddle/fluid/memory/allocation/cuda_virtual_mem_allocator.h
@@ -0,0 +1,62 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#ifdef PADDLE_WITH_CUDA
+#include <cuda_runtime.h>
+#include "paddle/fluid/platform/cuda_device_guard.h"
+#endif
+
+#include <mutex>  // NOLINT
+#include "paddle/fluid/memory/allocation/allocator.h"
+#include "paddle/fluid/platform/place.h"
+
+#if CUDA_VERSION >= 10020
+
+namespace paddle {
+namespace memory {
+namespace allocation {
+
+// Allocate memory using NVIDIA's virtual memory management technology
+class CUDAVirtualMemAllocator : public Allocator {
+ public:
+  explicit CUDAVirtualMemAllocator(const platform::CUDAPlace& place);
+
+  bool IsAllocThreadSafe() const override;
+
+ protected:
+  void FreeImpl(Allocation* allocation) override;
+  Allocation* AllocateImpl(size_t size) override;
+
+ private:
+  platform::CUDAPlace place_;
+
+  CUdeviceptr virtual_mem_base_;
+  size_t virtual_mem_size_;
+  size_t virtual_mem_alloced_offset_;
+  size_t granularity_;
+
+  CUmemAllocationProp prop_;
+  std::vector<CUmemAccessDesc> access_desc_;
+
+  std::map<CUdeviceptr, std::pair<CUmemGenericAllocationHandle, size_t>>
+      virtual_2_physical_map_;
+};
+
+}  // namespace allocation
+}  // namespace memory
+}  // namespace paddle
+
+#endif
diff --git a/paddle/fluid/memory/allocation/virtual_memory_auto_growth_best_fit_allocator.cc b/paddle/fluid/memory/allocation/virtual_memory_auto_growth_best_fit_allocator.cc
new file mode 100644
index 0000000000000..5c7e8e2d933f3
--- /dev/null
+++ b/paddle/fluid/memory/allocation/virtual_memory_auto_growth_best_fit_allocator.cc
@@ -0,0 +1,254 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <mutex>
+
+#include "paddle/fluid/memory/allocation/aligned_allocator.h"
+#include "paddle/fluid/memory/allocation/virtual_memory_auto_growth_best_fit_allocator.h"
+
+namespace paddle {
+namespace memory {
+namespace allocation {
+
+bool NeedSplit(size_t block_size, size_t alignment, size_t allock_size) {
+  return block_size > (allock_size * 2) ||
+         (block_size - allock_size) > alignment;
+}
+
+VirtualMemoryAutoGrowthBestFitAllocator::
+    VirtualMemoryAutoGrowthBestFitAllocator(
+        const std::shared_ptr<Allocator> &underlying_allocator,
+        size_t alignment, const platform::CUDAPlace &place)
+    : underlying_allocator_(
+          std::make_shared<AlignedAllocator>(underlying_allocator, alignment)),
+      alignment_(alignment),
+      place_(place) {}
+
+Allocation *VirtualMemoryAutoGrowthBestFitAllocator::AllocateImpl(size_t size) {
+  std::lock_guard<SpinLock> guard(spinlock_);
+  size = AlignedSize(size, alignment_);
+  auto result = AllocFromFreeBlocks(size);
+
+  if (!result) {
+    ExtendAndMerge(size);
+    result = AllocFromFreeBlocks(size);
+  }
+
+  return result;
+}
+
+void VirtualMemoryAutoGrowthBestFitAllocator::FreeImpl(Allocation *allocation) {
+  std::lock_guard<SpinLock> guard(spinlock_);
+  auto block_it = static_cast<BlockAllocation *>(allocation)->block_it_;
+  TryMergeBlock2Blocks(block_it);
+  delete allocation;
+}
+
+void VirtualMemoryAutoGrowthBestFitAllocator::TryMergeBlock2Blocks(
+    std::list<Block>::iterator block) {
+  if (block->ptr_ == all_blocks_.front().ptr_ &&
+      block->ptr_ == all_blocks_.back().ptr_) {
+    block->is_free_ = true;
+    free_blocks_.emplace(std::make_pair(block->size_, block->ptr_), block);
+  } else if (block->ptr_ == all_blocks_.front().ptr_) {
+    auto next = std::next(block);
+    if (next->is_free_ &&
+        reinterpret_cast<uint8_t *>(block->ptr_) + block->size_ == next->ptr_) {
+      // merge with next
+      block->size_ += next->size_;
+      block->is_free_ = true;
+      free_blocks_.erase(std::make_pair(next->size_, next->ptr_));
+      all_blocks_.erase(next);
+      free_blocks_.emplace(std::make_pair(block->size_, block->ptr_), block);
+    } else {
+      block->is_free_ = true;
+      free_blocks_.emplace(std::make_pair(block->size_, block->ptr_), block);
+    }
+  } else if (block->ptr_ == all_blocks_.back().ptr_) {
+    auto pre = std::prev(block);
+    if (pre->is_free_ &&
+        reinterpret_cast<uint8_t *>(pre->ptr_) + pre->size_ == block->ptr_) {
+      // merge with pre
+      free_blocks_.erase(std::make_pair(pre->size_, pre->ptr_));
+      pre->size_ += block->size_;
+      all_blocks_.erase(block);
+      free_blocks_.emplace(std::make_pair(pre->size_, pre->ptr_), pre);
+    } else {
+      block->is_free_ = true;
+      free_blocks_.emplace(std::make_pair(block->size_, block->ptr_), block);
+    }
+  } else {
+    auto pre = std::prev(block);
+    auto next = std::next(block);
+    if (pre->is_free_ &&
+        reinterpret_cast<uint8_t *>(pre->ptr_) + pre->size_ == block->ptr_ &&
+        !(next->is_free_ &&
+          reinterpret_cast<uint8_t *>(block->ptr_) + block->size_ ==
+              next->ptr_)) {
+      // merge with pre
+      free_blocks_.erase(std::make_pair(pre->size_, pre->ptr_));
+      pre->size_ += block->size_;
+      all_blocks_.erase(block);
+      free_blocks_.emplace(std::make_pair(pre->size_, pre->ptr_), pre);
+    } else if (next->is_free_ &&
+               reinterpret_cast<uint8_t *>(block->ptr_) + block->size_ ==
+                   next->ptr_ &&
+               !(pre->is_free_ &&
+                 reinterpret_cast<uint8_t *>(pre->ptr_) + pre->size_ ==
+                     block->ptr_)) {
+      // merge with next
+      block->size_ += next->size_;
+      block->is_free_ = true;
+      free_blocks_.erase(std::make_pair(next->size_, next->ptr_));
+      all_blocks_.erase(next);
+      free_blocks_.emplace(std::make_pair(block->size_, block->ptr_), block);
+    } else if (pre->is_free_ &&
+               reinterpret_cast<uint8_t *>(pre->ptr_) + pre->size_ ==
+                   block->ptr_ &&
+               next->is_free_ &&
+               reinterpret_cast<uint8_t *>(block->ptr_) + block->size_ ==
+                   next->ptr_) {
+      // merge with pre and next
+      free_blocks_.erase(std::make_pair(pre->size_, pre->ptr_));
+      free_blocks_.erase(std::make_pair(next->size_, next->ptr_));
+      pre->size_ += (block->size_ + next->size_);
+      all_blocks_.erase(block);
+      all_blocks_.erase(next);
+      free_blocks_.emplace(std::make_pair(pre->size_, pre->ptr_), pre);
+    } else {
+      block->is_free_ = true;
+      free_blocks_.emplace(std::make_pair(block->size_, block->ptr_), block);
+    }
+  }
+}
+
+void VirtualMemoryAutoGrowthBestFitAllocator::ExtendAndMerge(size_t size) {
+  void *ptr = nullptr;
+
+  auto allocateptr = underlying_allocator_->Allocate(size);
+  ptr = allocateptr->ptr();
+  size = allocateptr->size();
+  allocations_.push_back(std::move(allocateptr));  // hold allocation
+
+  if (all_blocks_.empty()) {
+    all_blocks_.push_back(Block(ptr, size, true));
+    free_blocks_.emplace(std::make_pair(size, ptr), all_blocks_.begin());
+    return;
+  }
+  for (auto block_it = all_blocks_.begin(); block_it != all_blocks_.end();
+       ++block_it) {
+    if (block_it->ptr_ > ptr) {
+      if (block_it == all_blocks_.begin()) {
+        // insert to front
+        if (block_it->is_free_ &&
+            reinterpret_cast<uint8_t *>(ptr) + size == block_it->ptr_) {
+          // merge with next
+          free_blocks_.erase(std::make_pair(block_it->size_, block_it->ptr_));
+          block_it->ptr_ = ptr;
+          block_it->size_ += size;
+          free_blocks_.emplace(std::make_pair(block_it->size_, block_it->ptr_),
+                               block_it);
+        } else {
+          // do not merge
+          all_blocks_.push_front(Block(ptr, size, true));
+          free_blocks_.emplace(std::make_pair(size, ptr), all_blocks_.begin());
+        }
+      } else {
+        // insert to middle
+        auto next = block_it;
+        auto pre = std::prev(block_it);
+        if (pre->is_free_ &&
+            reinterpret_cast<uint8_t *>(pre->ptr_) + pre->size_ == ptr &&
+            !(next->is_free_ &&
+              reinterpret_cast<uint8_t *>(ptr) + size == next->ptr_)) {
+          // merge with pre
+          free_blocks_.erase(std::make_pair(pre->size_, pre->ptr_));
+          pre->size_ += size;
+          free_blocks_.emplace(std::make_pair(pre->size_, pre->ptr_), pre);
+        } else if (next->is_free_ &&
+                   reinterpret_cast<uint8_t *>(ptr) + size == next->ptr_ &&
+                   !(pre->is_free_ &&
+                     reinterpret_cast<uint8_t *>(pre->ptr_) + pre->size_ ==
+                         ptr)) {
+          // merge with next
+          free_blocks_.erase(std::make_pair(next->size_, next->ptr_));
+          next->ptr_ = ptr;
+          next->size_ += size;
+          free_blocks_.emplace(std::make_pair(next->size_, next->ptr_), next);
+        } else if (pre->is_free_ &&
+                   reinterpret_cast<uint8_t *>(pre->ptr_) + pre->size_ == ptr &&
+                   next->is_free_ &&
+                   reinterpret_cast<uint8_t *>(ptr) + size == next->ptr_) {
+          // merge with pre and next
+          free_blocks_.erase(std::make_pair(pre->size_, pre->ptr_));
+          free_blocks_.erase(std::make_pair(next->size_, next->ptr_));
+          pre->size_ += (size + next->size_);
+          free_blocks_.emplace(std::make_pair(pre->size_, pre->ptr_), pre);
+          all_blocks_.erase(next);
+        } else {
+          // do not merge
+          auto iter = all_blocks_.insert(next, Block(ptr, size, true));
+          free_blocks_.emplace(std::make_pair(size, ptr), iter);
+        }
+      }
+      return;
+    }
+  }
+
+  // insert to back
+  auto block_it = all_blocks_.end();
+  block_it--;
+  if (block_it->is_free_ &&
+      reinterpret_cast<uint8_t *>(block_it->ptr_) + block_it->size_ == ptr) {
+    // merge with pre
+    free_blocks_.erase(std::make_pair(block_it->size_, block_it->ptr_));
+    block_it->size_ += size;
+    free_blocks_.emplace(std::make_pair(block_it->size_, block_it->ptr_),
+                         block_it);
+  } else {
+    // do not merge
+    all_blocks_.push_back(Block(ptr, size, true));
+    auto block_it = all_blocks_.end();
+    block_it--;
+    free_blocks_.emplace(std::make_pair(size, ptr), block_it);
+  }
+}
+
+Allocation *VirtualMemoryAutoGrowthBestFitAllocator::AllocFromFreeBlocks(
+    size_t size) {
+  auto iter = free_blocks_.lower_bound(std::make_pair(size, nullptr));
+  if (iter != free_blocks_.end()) {
+    std::list<Block>::iterator block_it = iter->second;
+    free_blocks_.erase(iter);
+    if (NeedSplit(block_it->size_, alignment_, size)) {
+      size_t remaining_size = block_it->size_ - size;
+      auto remaining_free_block = all_blocks_.insert(
+          block_it, Block(block_it->ptr_, remaining_size, true));
+      free_blocks_.emplace(std::make_pair(remaining_size, block_it->ptr_),
+                           remaining_free_block);
+      block_it->ptr_ =
+          reinterpret_cast<uint8_t *>(block_it->ptr_) + remaining_size;
+      block_it->size_ = size;
+    }
+
+    block_it->is_free_ = false;
+    return new BlockAllocation(block_it, place_);
+  }
+
+  return nullptr;
+}
+
+}  // namespace allocation
+}  // namespace memory
+}  // namespace paddle
diff --git a/paddle/fluid/memory/allocation/virtual_memory_auto_growth_best_fit_allocator.h b/paddle/fluid/memory/allocation/virtual_memory_auto_growth_best_fit_allocator.h
new file mode 100644
index 0000000000000..5171e5b3cd1bf
--- /dev/null
+++ b/paddle/fluid/memory/allocation/virtual_memory_auto_growth_best_fit_allocator.h
@@ -0,0 +1,84 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <list>
+#include <map>
+#include <set>
+
+#include "paddle/fluid/memory/allocation/allocator.h"
+#include "paddle/fluid/memory/allocation/spin_lock.h"
+
+namespace paddle {
+namespace memory {
+namespace allocation {
+
+struct Block {
+  Block(void *ptr, size_t size, bool is_free)
+      : ptr_(ptr), size_(size), is_free_(is_free) {}
+
+  void *ptr_;
+  size_t size_;
+  bool is_free_;
+};
+
+struct BlockAllocation : public Allocation {
+  explicit BlockAllocation(const std::list<Block>::iterator &it,
+                           platform::Place place)
+      : Allocation(it->ptr_, it->size_, place), block_it_(it) {}
+
+  std::list<Block>::iterator block_it_;
+};
+
+/**
+ * Like AutoGrowthBestFitAllocator, VirtualMemoryAutoGrowthBestFitAllocator will
+ * gradually apply to GPU for video memory as the model uses more video memory.
+ * However, the difference is that VirtualMemoryAutoGrowthBestFitAllocator uses
+ * nviaid's virtual memory management technology and obtains the virtual memory
+ * address. If the video memory applied for twice is continuous, we can combine
+ * the two video memories later. This combination can greatly reduce
+ * fragmentation.
+ */
+class VirtualMemoryAutoGrowthBestFitAllocator : public Allocator {
+ public:
+  VirtualMemoryAutoGrowthBestFitAllocator(
+      const std::shared_ptr<Allocator> &underlying_allocator, size_t alignment,
+      const platform::CUDAPlace &place);
+
+  bool IsAllocThreadSafe() const override { return true; }
+
+ protected:
+  Allocation *AllocateImpl(size_t size) override;
+
+  void FreeImpl(Allocation *allocation) override;
+
+ private:
+  Allocation *AllocFromFreeBlocks(size_t size);
+  void ExtendAndMerge(size_t size);
+  void TryMergeBlock2Blocks(std::list<Block>::iterator iter);
+
+  std::shared_ptr<Allocator> underlying_allocator_;
+  size_t alignment_;
+
+  std::map<std::pair<size_t, void *>, std::list<Block>::iterator> free_blocks_;
+  std::list<Block> all_blocks_;
+  std::list<AllocationPtr> allocations_;
+  platform::Place place_;
+  SpinLock spinlock_;
+};
+
+}  // namespace allocation
+}  // namespace memory
+}  // namespace paddle
diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt
index dcf492dc6da37..f8d143af8a47c 100644
--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
@@ -79,8 +79,10 @@ if(WITH_UNITY_BUILD)
     include(unity_build_rule.cmake)
 endif()
 
-register_operators(EXCLUDES py_layer_op py_func_op warpctc_op dgc_op load_combine_op lstm_op run_program_op eye_op 
-        recurrent_op save_combine_op sparse_attention_op sync_batch_norm_op spectral_op ${OP_MKL_DEPS} DEPS ${OP_HEADER_DEPS})
+set(OP_HEADER_DEPS ${OP_HEADER_DEPS} pten)
+
+register_operators(EXCLUDES py_layer_op py_func_op warpctc_op dgc_op load_combine_op lstm_op run_program_op eye_op
+        recurrent_op save_combine_op sparse_attention_op sync_batch_norm_op spectral_op cinn_launch_op ${OP_MKL_DEPS} DEPS ${OP_HEADER_DEPS})
 
 op_library(run_program_op SRCS run_program_op.cc run_program_op.cu.cc DEPS executor_cache ${OP_HEADER_DEPS})
 op_library(save_combine_op DEPS string_array)
@@ -97,7 +99,7 @@ if (WITH_GPU OR WITH_ROCM)
     endif()
     op_library(sync_batch_norm_op)
     file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(sync_batch_norm);\n")
-    if ((NOT WIN32) AND (NOT WITH_ROCM) AND (NOT PADDLE_WITH_ARM) AND (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_LESS 11.2) )
+    if ((NOT WIN32) AND (NOT WITH_ROCM) AND (NOT PADDLE_WITH_ARM) AND (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_LESS 11.3) )
         op_library(sparse_attention_op)
         file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(sparse_attention);\n")
     endif()
@@ -166,6 +168,11 @@ if (WITH_ASCEND_CL)
   set(COMMON_OP_DEPS ${COMMON_OP_DEPS} npu_op_runner)
 endif()
 
+if (WITH_CINN)
+  op_library(cinn_launch_op SRCS cinn_launch_op.cc cinn_launch_op.cu.cc DEPS transform_desc cinn_compiler cinn ${OP_HEADER_DEPS})
+  cc_test(cinn_launch_op_test SRCS cinn_launch_op_test.cc DEPS cinn_compiler cinn_launch_op elementwise_add_op)
+endif()
+
 # FIXME(typhoonzero): operator deps may not needed.
 # op_library(lod_tensor_to_array_op DEPS lod_rank_table_op)
 # op_library(array_to_lod_tensor_op DEPS lod_rank_table_op)
diff --git a/paddle/fluid/operators/activation_op_npu.cc b/paddle/fluid/operators/activation_op_npu.cc
index 20c56d6a27933..e0cb4dee5311a 100644
--- a/paddle/fluid/operators/activation_op_npu.cc
+++ b/paddle/fluid/operators/activation_op_npu.cc
@@ -503,7 +503,6 @@ class SwishGradNPUKernel : public framework::OpKernel<T> {
     beta_x.mutable_data<T>(x->dims(), ctx.GetPlace());
     sigmoid_out.mutable_data<T>(x->dims(), ctx.GetPlace());
     swish_out.mutable_data<T>(x->dims(), ctx.GetPlace());
-
     const auto& muls_runner =
         NpuOpRunner("Muls", {*x}, {beta_x}, {{"value", beta}});
     muls_runner.Run(stream);
@@ -515,6 +514,9 @@ class SwishGradNPUKernel : public framework::OpKernel<T> {
     const auto& mul_runner =
         NpuOpRunner("Mul", {sigmoid_out, *x}, {swish_out}, {});
     mul_runner.Run(stream);
+    const auto& muls_runner2 =
+        NpuOpRunner("Muls", {swish_out}, {swish_out}, {{"value", beta}});
+    muls_runner2.Run(stream);
 
     const auto& mul_runner1 =
         NpuOpRunner("Mul", {sigmoid_out, swish_out}, {*dx}, {});
diff --git a/paddle/fluid/operators/activation_op_xpu.cc b/paddle/fluid/operators/activation_op_xpu.cc
index 257a91d7c15d7..2c3d9697366ca 100644
--- a/paddle/fluid/operators/activation_op_xpu.cc
+++ b/paddle/fluid/operators/activation_op_xpu.cc
@@ -53,14 +53,14 @@ class XPUActivationGradKernel
   }
 };
 
-template <typename DeviceContext, typename T>
+template <typename DeviceContext, typename T, typename XPUT>
 void xpu_activation_forward(
     const framework::ExecutionContext &ctx,
-    std::function<int(xpu::Context *, const T *, T *, int)> func) {
+    std::function<int(xpu::Context *, const XPUT *, XPUT *, int)> func) {
   const auto *x = ctx.Input<Tensor>("X");
   auto *y = ctx.Output<Tensor>("Out");
-  const T *x_data = x->data<T>();
-  T *y_data = y->mutable_data<T>(ctx.GetPlace());
+  const XPUT *x_data = reinterpret_cast<const XPUT *>(x->data<T>());
+  XPUT *y_data = reinterpret_cast<XPUT *>(y->mutable_data<T>(ctx.GetPlace()));
 
   auto xpu_context = ctx.device_context<DeviceContext>().x_context();
   int r = func(xpu_context, x_data, y_data, x->numel());
@@ -70,23 +70,24 @@ void xpu_activation_forward(
                                  r, XPUAPIErrorMsg[r]));
 }
 
-template <typename DeviceContext, typename T>
-void xpu_activation_backward(const framework::ExecutionContext &ctx,
-                             std::function<int(xpu::Context *, const T *,
-                                               const T *, const T *, T *, int)>
-                                 func) {
+template <typename DeviceContext, typename T, typename XPUT>
+void xpu_activation_backward(
+    const framework::ExecutionContext &ctx,
+    std::function<int(xpu::Context *, const XPUT *, const XPUT *, const XPUT *,
+                      XPUT *, int)>
+        func) {
   /* TODO: relu tanh sigmoid are inplace */
   const auto *x = ctx.Input<Tensor>("X");
   auto *y = ctx.Input<Tensor>("Out");
   auto *dOut = ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
   auto *dX = ctx.Output<framework::Tensor>(framework::GradVarName("X"));
-  const T *x_data = nullptr;
-  const T *y_data = nullptr;
-  const T *y_grad = nullptr;
-  if (x != nullptr) x_data = x->data<T>();
-  if (y != nullptr) y_data = y->data<T>();
-  if (dOut != nullptr) y_grad = dOut->data<T>();
-  T *x_grad = dX->mutable_data<T>(ctx.GetPlace());
+  const XPUT *x_data = nullptr;
+  const XPUT *y_data = nullptr;
+  const XPUT *y_grad = nullptr;
+  if (x != nullptr) x_data = reinterpret_cast<const XPUT *>(x->data<T>());
+  if (y != nullptr) y_data = reinterpret_cast<const XPUT *>(y->data<T>());
+  if (dOut != nullptr) y_grad = reinterpret_cast<const XPUT *>(dOut->data<T>());
+  XPUT *x_grad = reinterpret_cast<XPUT *>(dX->mutable_data<T>(ctx.GetPlace()));
   auto xpu_context = ctx.device_context<DeviceContext>().x_context();
 
   int r = func(xpu_context, x_data, y_data, y_grad, x_grad, dX->numel());
@@ -98,65 +99,64 @@ void xpu_activation_backward(const framework::ExecutionContext &ctx,
 
 template <typename T>
 struct XPUReluFunctor : public BaseActivationFunctor<T> {
+  using XPUType = typename XPUTypeTrait<T>::Type;
   void operator()(const framework::ExecutionContext &ctx) const {
-    xpu_activation_forward<paddle::platform::XPUDeviceContext, T>(ctx,
-                                                                  xpu::relu<T>);
+    xpu_activation_forward<paddle::platform::XPUDeviceContext, T, XPUType>(
+        ctx, xpu::relu<XPUType>);
   }
 };
 
 template <typename T>
 struct XPUSigmoidFunctor : public BaseActivationFunctor<T> {
+  using XPUType = typename XPUTypeTrait<T>::Type;
   void operator()(const framework::ExecutionContext &ctx) const {
-    xpu_activation_forward<paddle::platform::XPUDeviceContext, T>(
-        ctx, xpu::sigmoid<T>);
+    xpu_activation_forward<paddle::platform::XPUDeviceContext, T, XPUType>(
+        ctx, xpu::sigmoid<XPUType>);
   }
 };
 
 template <typename T>
 struct XPUTanhFunctor : public BaseActivationFunctor<T> {
+  using XPUType = typename XPUTypeTrait<T>::Type;
   void operator()(const framework::ExecutionContext &ctx) const {
-    xpu_activation_forward<paddle::platform::XPUDeviceContext, T>(ctx,
-                                                                  xpu::tanh<T>);
-  }
-};
-
-template <typename T>
-struct XPUGeluFunctor : public BaseActivationFunctor<T> {
-  void operator()(const framework::ExecutionContext &ctx) const {
-    xpu_activation_forward<paddle::platform::XPUDeviceContext, T>(ctx,
-                                                                  xpu::gelu<T>);
+    xpu_activation_forward<paddle::platform::XPUDeviceContext, T, XPUType>(
+        ctx, xpu::tanh<XPUType>);
   }
 };
 
 template <typename T>
 struct XPULogFunctor : public BaseActivationFunctor<T> {
+  using XPUType = typename XPUTypeTrait<T>::Type;
   void operator()(const framework::ExecutionContext &ctx) const {
-    xpu_activation_forward<paddle::platform::XPUDeviceContext, T>(ctx,
-                                                                  xpu::log<T>);
+    xpu_activation_forward<paddle::platform::XPUDeviceContext, T, XPUType>(
+        ctx, xpu::log<XPUType>);
   }
 };
 
 template <typename T>
 struct XPUSquareFunctor : public BaseActivationFunctor<T> {
+  using XPUType = typename XPUTypeTrait<T>::Type;
   void operator()(const framework::ExecutionContext &ctx) const {
-    xpu_activation_forward<paddle::platform::XPUDeviceContext, T>(
-        ctx, xpu::square<T>);
+    xpu_activation_forward<paddle::platform::XPUDeviceContext, T, XPUType>(
+        ctx, xpu::square<XPUType>);
   }
 };
 
 template <typename T>
 struct XPUSqrtFunctor : public BaseActivationFunctor<T> {
+  using XPUType = typename XPUTypeTrait<T>::Type;
   void operator()(const framework::ExecutionContext &ctx) const {
-    xpu_activation_forward<paddle::platform::XPUDeviceContext, T>(ctx,
-                                                                  xpu::sqrt<T>);
+    xpu_activation_forward<paddle::platform::XPUDeviceContext, T, XPUType>(
+        ctx, xpu::sqrt<XPUType>);
   }
 };
 
 template <typename T>
 struct XPUAbsFunctor : public BaseActivationFunctor<T> {
+  using XPUType = typename XPUTypeTrait<T>::Type;
   void operator()(const framework::ExecutionContext &ctx) const {
-    xpu_activation_forward<paddle::platform::XPUDeviceContext, T>(ctx,
-                                                                  xpu::abs<T>);
+    xpu_activation_forward<paddle::platform::XPUDeviceContext, T, XPUType>(
+        ctx, xpu::abs<XPUType>);
   }
 };
 
@@ -196,6 +196,7 @@ struct XPUPowFunctor : public BaseActivationFunctor<T> {
 
 template <typename T>
 struct XPUHardSwishFunctor : public BaseActivationFunctor<T> {
+  using XPUType = typename XPUTypeTrait<T>::Type;
   void operator()(const framework::ExecutionContext &ctx) const {
     float threshold = ctx.Attr<float>("threshold");
     float scale = ctx.Attr<float>("scale");
@@ -208,61 +209,59 @@ struct XPUHardSwishFunctor : public BaseActivationFunctor<T> {
     PADDLE_ENFORCE_EQ(
         offset, 3.0f,
         platform::errors::External("Not support offset [%f] in XPU", offset));
-    xpu_activation_forward<paddle::platform::XPUDeviceContext, T>(
-        ctx, xpu::hard_swish<T>);
+    xpu_activation_forward<paddle::platform::XPUDeviceContext, T, XPUType>(
+        ctx, xpu::hard_swish<XPUType>);
   }
 };
 
 template <typename T>
 struct XPUReluGradFunctor : public BaseActivationFunctor<T> {
+  using XPUType = typename XPUTypeTrait<T>::Type;
   void operator()(const framework::ExecutionContext &ctx) const {
-    xpu_activation_backward<paddle::platform::XPUDeviceContext, T>(
-        ctx, xpu::relu_grad<T>);
+    xpu_activation_backward<paddle::platform::XPUDeviceContext, T, XPUType>(
+        ctx, xpu::relu_grad<XPUType>);
   }
 };
 
 template <typename T>
 struct XPUTanhGradFunctor : public BaseActivationFunctor<T> {
+  using XPUType = typename XPUTypeTrait<T>::Type;
   void operator()(const framework::ExecutionContext &ctx) const {
-    xpu_activation_backward<paddle::platform::XPUDeviceContext, T>(
-        ctx, xpu::tanh_grad<T>);
+    xpu_activation_backward<paddle::platform::XPUDeviceContext, T, XPUType>(
+        ctx, xpu::tanh_grad<XPUType>);
   }
 };
 
 template <typename T>
 struct XPUSigmoidGradFunctor : public BaseActivationFunctor<T> {
+  using XPUType = typename XPUTypeTrait<T>::Type;
   void operator()(const framework::ExecutionContext &ctx) const {
-    xpu_activation_backward<paddle::platform::XPUDeviceContext, T>(
-        ctx, xpu::sigmoid_grad<T>);
-  }
-};
-
-template <typename T>
-struct XPUGeluGradFunctor : public BaseActivationFunctor<T> {
-  void operator()(const framework::ExecutionContext &ctx) const {
-    xpu_activation_backward<paddle::platform::XPUDeviceContext, T>(
-        ctx, xpu::gelu_grad<T>);
+    xpu_activation_backward<paddle::platform::XPUDeviceContext, T, XPUType>(
+        ctx, xpu::sigmoid_grad<XPUType>);
   }
 };
 
 template <typename T>
 struct XPUSqrtGradFunctor : public BaseActivationFunctor<T> {
+  using XPUType = typename XPUTypeTrait<T>::Type;
   void operator()(const framework::ExecutionContext &ctx) const {
-    xpu_activation_backward<paddle::platform::XPUDeviceContext, T>(
-        ctx, xpu::sqrt_grad<T>);
+    xpu_activation_backward<paddle::platform::XPUDeviceContext, T, XPUType>(
+        ctx, xpu::sqrt_grad<XPUType>);
   }
 };
 
 template <typename T>
 struct XPUSquareGradFunctor : public BaseActivationFunctor<T> {
+  using XPUType = typename XPUTypeTrait<T>::Type;
   void operator()(const framework::ExecutionContext &ctx) const {
-    xpu_activation_backward<paddle::platform::XPUDeviceContext, T>(
-        ctx, xpu::square_grad<T>);
+    xpu_activation_backward<paddle::platform::XPUDeviceContext, T, XPUType>(
+        ctx, xpu::square_grad<XPUType>);
   }
 };
 
 template <typename T>
 struct XPUHardSwishGradFunctor : public BaseActivationFunctor<T> {
+  using XPUType = typename XPUTypeTrait<T>::Type;
   void operator()(const framework::ExecutionContext &ctx) const {
     float threshold = ctx.Attr<float>("threshold");
     float scale = ctx.Attr<float>("scale");
@@ -275,8 +274,8 @@ struct XPUHardSwishGradFunctor : public BaseActivationFunctor<T> {
     PADDLE_ENFORCE_EQ(
         offset, 3.0f,
         platform::errors::External("Not support offset [%f] in XPU", offset));
-    xpu_activation_backward<paddle::platform::XPUDeviceContext, T>(
-        ctx, xpu::hard_swish_grad<T>);
+    xpu_activation_backward<paddle::platform::XPUDeviceContext, T, XPUType>(
+        ctx, xpu::hard_swish_grad<XPUType>);
   }
 };
 
@@ -342,16 +341,23 @@ namespace ops = paddle::operators;
       ops::XPUActivationGradKernel<ops::grad_functor<float>>);
 
 REGISTER_ACTIVATION_XPU_KERNEL(relu, XPUReluFunctor, XPUReluGradFunctor)
-REGISTER_ACTIVATION_XPU_KERNEL(tanh, XPUTanhFunctor, XPUTanhGradFunctor)
 REGISTER_ACTIVATION_XPU_KERNEL(sigmoid, XPUSigmoidFunctor,
                                XPUSigmoidGradFunctor)
-REGISTER_ACTIVATION_XPU_KERNEL(gelu, XPUGeluFunctor, XPUGeluGradFunctor)
 REGISTER_ACTIVATION_XPU_KERNEL(sqrt, XPUSqrtFunctor, XPUSqrtGradFunctor)
 REGISTER_ACTIVATION_XPU_KERNEL(square, XPUSquareFunctor, XPUSquareGradFunctor)
 REGISTER_ACTIVATION_XPU_KERNEL(hard_swish, XPUHardSwishFunctor,
                                XPUHardSwishGradFunctor)
 REGISTER_ACTIVATION_XPU_KERNEL(leaky_relu, XPULeakyReluFunctor,
                                XPULeakyReluGradFunctor)
+
+REGISTER_OP_XPU_KERNEL(
+    tanh, ops::XPUActivationKernel<ops::XPUTanhFunctor<float>>,
+    ops::XPUActivationKernel<ops::XPUTanhFunctor<paddle::platform::float16>>);
+REGISTER_OP_XPU_KERNEL(
+    tanh_grad, ops::XPUActivationGradKernel<ops::XPUTanhGradFunctor<float>>,
+    ops::XPUActivationGradKernel<
+        ops::XPUTanhGradFunctor<paddle::platform::float16>>);
+
 REGISTER_OP_XPU_KERNEL(log,
                        ops::XPUActivationKernel<ops::XPULogFunctor<float>>);
 REGISTER_OP_XPU_KERNEL(pow,
diff --git a/paddle/fluid/operators/amp/check_finite_and_unscale_op_xpu.cc b/paddle/fluid/operators/amp/check_finite_and_unscale_op_xpu.cc
index 210f3e098f95f..28c209018d662 100644
--- a/paddle/fluid/operators/amp/check_finite_and_unscale_op_xpu.cc
+++ b/paddle/fluid/operators/amp/check_finite_and_unscale_op_xpu.cc
@@ -74,27 +74,15 @@ class CheckFiniteAndUnscaleXPUKernel : public framework::OpKernel<T> {
             platform::errors::External("XPU API(logical_not) return wrong "
                                        "value[%d %s]",
                                        r, XPUAPIErrorMsg[r]));
-        r = xpu::isnan(dev_ctx.x_context(),
-                       reinterpret_cast<const XPUTyp*>(x->data<T>()),
-                       is_nan.data<bool>(), x->numel());
-        PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, platform::errors::External(
-                                              "XPU API(isnan) return wrong "
-                                              "value[%d %s]",
-                                              r, XPUAPIErrorMsg[r]));
-        r = xpu::logical_or(dev_ctx.x_context(), is_finite.data<bool>(),
-                            is_nan.data<bool>(), is_finite.data<bool>(),
-                            x->numel());
-        PADDLE_ENFORCE_EQ(
-            r, XPU_SUCCESS,
-            platform::errors::External("XPU API(logical_or) return wrong "
-                                       "value[%d %s]",
-                                       r, XPUAPIErrorMsg[r]));
         r = xpu::any(dev_ctx.x_context(), is_finite.data<bool>(),
                      found_inf_data, x->numel());
         PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, platform::errors::External(
                                               "XPU API(any) return wrong "
                                               "value[%d %s]",
                                               r, XPUAPIErrorMsg[r]));
+        if (dev_ctx.x_context()->xpu_stream) {
+          dev_ctx.Wait();
+        }
         memory::Copy(platform::CPUPlace(), &cpu_found_inf_data,
                      BOOST_GET_CONST(platform::XPUPlace, dev_ctx.GetPlace()),
                      found_inf_data, sizeof(bool));
@@ -103,12 +91,12 @@ class CheckFiniteAndUnscaleXPUKernel : public framework::OpKernel<T> {
       if (cpu_found_inf_data) {
         inverse_scale = 0.0;
       }
-      auto dev_env = XPUEnv::getenv("XPUSIM_DEVICE_MODEL");
 
+      paddle::platform::XPUVersion version = dev_ctx.xpu_version();
+      framework::Tensor float_x;
+      framework::Tensor float_out;
       if (std::is_same<T, paddle::platform::float16>::value &&
-          (dev_env == nullptr || std::strcmp(dev_env, "KUNLUN1"))) {
-        framework::Tensor float_x;
-        framework::Tensor float_out;
+          (version == paddle::platform::XPUVersion::XPU1)) {
         float_x.mutable_data<MPDType>(dev_ctx.GetPlace(),
                                       x->numel() * sizeof(MPDType));
         float_out.mutable_data<MPDType>(dev_ctx.GetPlace(),
@@ -137,10 +125,6 @@ class CheckFiniteAndUnscaleXPUKernel : public framework::OpKernel<T> {
                                               "XPU API(cast_v2) return wrong "
                                               "value[%d %s]",
                                               r, XPUAPIErrorMsg[r]));
-        if (dev_ctx.x_context()->xpu_stream) {
-          dev_ctx.Wait();
-        }
-
       } else {
         int r = xpu::scale(dev_ctx.x_context(),
                            reinterpret_cast<const XPUTyp*>(x->data<T>()),
@@ -152,6 +136,9 @@ class CheckFiniteAndUnscaleXPUKernel : public framework::OpKernel<T> {
                                               r, XPUAPIErrorMsg[r]));
       }
     }
+    if (dev_ctx.x_context()->xpu_stream) {
+      dev_ctx.Wait();
+    }
     memory::Copy(BOOST_GET_CONST(platform::XPUPlace, dev_ctx.GetPlace()),
                  found_inf_data, platform::CPUPlace(), &cpu_found_inf_data,
                  sizeof(bool));
diff --git a/paddle/fluid/operators/amp/update_loss_scaling_op_xpu.cc b/paddle/fluid/operators/amp/update_loss_scaling_op_xpu.cc
index 1f05e5f246d9c..d9b3dcd6c15cf 100644
--- a/paddle/fluid/operators/amp/update_loss_scaling_op_xpu.cc
+++ b/paddle/fluid/operators/amp/update_loss_scaling_op_xpu.cc
@@ -113,10 +113,9 @@ class UpdateLossScalingXPUKernel : public framework::OpKernel<T> {
     } else {
       cpu_pre_loss_scaling_data = (*pre_loss_scaling_data);
     }
-
     int cpu_good_out_data = 0;
     int cpu_bad_out_data = 0;
-    MPDType cpu_updated_loss_scaling_data;
+    MPDType cpu_updated_loss_scaling_data = cpu_pre_loss_scaling_data;
 
     if (cpu_found_inf_data) {
       cpu_good_out_data = 0;
@@ -140,8 +139,7 @@ class UpdateLossScalingXPUKernel : public framework::OpKernel<T> {
         cpu_good_out_data = 0;
       }
     }
-
-    // copy to host
+    // copy to device
     memory::Copy(BOOST_GET_CONST(platform::XPUPlace, dev_ctx.GetPlace()),
                  bad_out_data, platform::CPUPlace(), &cpu_bad_out_data,
                  sizeof(int));
diff --git a/paddle/fluid/operators/batch_norm_op.cu b/paddle/fluid/operators/batch_norm_op.cu
index 3d26c2c570858..b4cf9c48df2a8 100644
--- a/paddle/fluid/operators/batch_norm_op.cu
+++ b/paddle/fluid/operators/batch_norm_op.cu
@@ -916,7 +916,7 @@ class BatchNormGradKernel<platform::CUDADeviceContext, T>
     Tensor transformed_d_y(d_y->type());
     Tensor transformed_d_x;
     if (data_layout == DataLayout::kNHWC &&
-        compute_format == DataLayout::kNCHW) {
+        compute_format == DataLayout::kNCHW && x_dims.size() > 2) {
       VLOG(3) << "Transform input tensor from NHWC to NCHW.";
       ResizeToChannelFirst<platform::CUDADeviceContext, T>(ctx, x,
                                                            &transformed_x);
diff --git a/paddle/fluid/operators/cinn_launch_op.cc b/paddle/fluid/operators/cinn_launch_op.cc
new file mode 100644
index 0000000000000..26b2e1b24921b
--- /dev/null
+++ b/paddle/fluid/operators/cinn_launch_op.cc
@@ -0,0 +1,262 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/operators/cinn_launch_op.h"
+#include "paddle/fluid/string/string_helper.h"
+
+namespace paddle {
+namespace operators {
+
+namespace details {
+
+const ::cinn::common::Target& PlaceToCinnTarget(const platform::Place& place) {
+  if (platform::is_cpu_place(place)) {
+    return ::cinn::common::DefaultHostTarget();
+  } else if (platform::is_gpu_place(place)) {
+    return ::cinn::common::DefaultNVGPUTarget();
+  }
+
+  PADDLE_THROW(platform::errors::InvalidArgument(
+      "CINN is not supported on current place:%s", place));
+  return ::cinn::common::UnkTarget();
+}
+
+void DebugCinnCompiledResult(const CinnCompiledObject& result) {
+  if (!VLOG_IS_ON(4)) {
+    return;
+  }
+  const auto& cinn_runtime_program = result.runtime_program;
+  const auto& cinn_scope = *(result.scope);
+  const auto& paddle2cinn_varmap = result.paddle2cinn_varmap;
+
+  VLOG(4) << "Compiled runtime_program instrunction size:["
+          << cinn_runtime_program->size() << "]";
+
+  std::vector<std::string> infos;
+  auto cinn_var_names = cinn_scope.var_names();
+  infos.reserve(cinn_var_names.size());
+  std::transform(cinn_var_names.begin(), cinn_var_names.end(),
+                 std::back_inserter(infos),
+                 [](const auto& name_view) { return name_view.data(); });
+  VLOG(4) << "Compiled scope variable names:["
+          << string::join_strings(infos, ',') << "]";
+
+  infos.clear();
+  infos.reserve(paddle2cinn_varmap.size());
+  std::transform(paddle2cinn_varmap.begin(), paddle2cinn_varmap.end(),
+                 std::back_inserter(infos), [](const auto& paddle2cinn) {
+                   return paddle2cinn.first + "->" + paddle2cinn.second;
+                 });
+  VLOG(4) << "Compiled paddle2cinn_varmap:[" << string::join_strings(infos, ',')
+          << "]";
+}
+
+std::vector<std::string> MapPaddleVariablesToCinn(
+    const std::vector<std::string>& paddle_names,
+    const std::unordered_map<std::string, std::string>& paddle2cinn_varmap) {
+  std::vector<std::string> result;
+  result.reserve(result.size());
+  std::transform(
+      paddle_names.begin(), paddle_names.end(), std::back_inserter(result),
+      [&paddle2cinn_varmap](const std::string& pd_name) {
+        PADDLE_ENFORCE_GT(paddle2cinn_varmap.count(pd_name), 0,
+                          platform::errors::NotFound(
+                              "Not found the corresponding cinn variable "
+                              "of paddle variable(%s) in compilation result.",
+                              pd_name));
+        return paddle2cinn_varmap.at(pd_name);
+      });
+  return result;
+}
+
+std::vector<CinnTensor> GetCinnTensorsFromCompiledScope(
+    const std::vector<std::string>& cinn_names, const CinnScope& cinn_scope) {
+  std::vector<CinnTensor> result;
+  result.reserve(cinn_names.size());
+  std::transform(cinn_names.begin(), cinn_names.end(),
+                 std::back_inserter(result),
+                 [&cinn_scope](const std::string& var_name) {
+                   PADDLE_ENFORCE_NOT_NULL(
+                       cinn_scope.FindVar(var_name),
+                       platform::errors::NotFound(
+                           "Variable(%s) not found in cinn scope.", var_name));
+                   return cinn_scope.GetTensor(var_name);
+                 });
+  return result;
+}
+
+void CheckTensorEquivalent(const std::string& paddle_name,
+                           const LoDTensor* paddle_tensor,
+                           const CinnTensor& cinn_tensor) {
+  PADDLE_ENFORCE_EQ(
+      paddle_tensor->IsInitialized(), true,
+      platform::errors::InvalidArgument(
+          "The tensor in variable(%s) is not initialized.", paddle_name));
+
+  // check dimension
+  auto cinn_dims = framework::make_ddim(cinn_tensor->shape().data());
+  PADDLE_ENFORCE_EQ(paddle_tensor->dims(), cinn_dims,
+                    platform::errors::InvalidArgument(
+                        "The tensor dimension in variable(%s) "
+                        "is not equivalent, paddle is [%s] "
+                        "but cinn is [%s].",
+                        paddle_name, paddle_tensor->dims(), cinn_dims));
+
+  // TODO(CtfGo): check the underlying data type after CINN ready
+}
+
+void TensorMutableDataWithCinnInfo(const platform::Place& place,
+                                   const CinnTensor& cinn_tensor,
+                                   LoDTensor* paddle_tensor) {
+  // TODO(CtfGo): support mutable corresponding c++ type after CINN ready
+  paddle_tensor->mutable_data<float>(
+      framework::make_ddim(cinn_tensor->shape().data()), place);
+}
+
+std::vector<std::string> SeperateTempVar(
+    const CinnScope& cinn_scope,
+    const std::vector<std::string>& input_cinn_names,
+    const std::vector<std::string>& output_cinn_names) {
+  auto cinn_var_names = cinn_scope.var_names();
+  std::unordered_set<std::string> all_cinn_names;
+  all_cinn_names.reserve(cinn_var_names.size());
+  std::transform(
+      cinn_var_names.begin(), cinn_var_names.end(),
+      std::inserter(all_cinn_names, all_cinn_names.end()),
+      [](const auto& name_view) { return std::string(name_view.data()); });
+
+  auto exclude_fn = [&all_cinn_names](const auto& cinn_name) {
+    all_cinn_names.erase(cinn_name);
+  };
+
+  std::for_each(input_cinn_names.begin(), input_cinn_names.end(), exclude_fn);
+  std::for_each(output_cinn_names.begin(), output_cinn_names.end(), exclude_fn);
+  return {all_cinn_names.begin(), all_cinn_names.end()};
+}
+
+std::unique_ptr<cinn_buffer_t> ShareTensorWithCinnBuffer(LoDTensor* tensor) {
+  // convert paddle dimensions array to cinn format
+  std::vector<cinn_dimension_t> cinn_dims(tensor->dims().size());
+  for (auto i = 0; i < tensor->dims().size(); ++i) {
+    cinn_dims[i] = static_cast<cinn_dimension_t>(tensor->dims().at(i));
+  }
+
+  auto cinn_buffer = std::make_unique<cinn_buffer_t>();
+  // assign size and memory
+  cinn_buffer->resize(cinn_dims.data(), cinn_dims.size());
+  cinn_buffer->memory = reinterpret_cast<uint8_t*>(tensor->data<float>());
+  return cinn_buffer;
+}
+
+void CheckArgumentsNotMissed(
+    const CinnScope& cinn_scope,
+    const std::map<std::string, cinn_pod_value_t>& name2argument) {
+  auto cinn_var_names = cinn_scope.var_names();
+  std::for_each(cinn_var_names.begin(), cinn_var_names.end(),
+                [&name2argument](const auto& name_view) {
+                  PADDLE_ENFORCE_GT(
+                      name2argument.count(name_view.data()), 0,
+                      platform::errors::InvalidArgument(
+                          "Parameter(%s) is not assgined.", name_view.data()));
+                });
+}
+
+}  // namespace details
+
+class CinnLaunchOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    OP_INOUT_CHECK(ctx->HasInputs(kX), "Input", kX, "CinnLaunchOp");
+    OP_INOUT_CHECK(ctx->HasOutputs(kOutputs), "Output", kOutputs,
+                   "CinnLaunchOp");
+  }
+
+ protected:
+  /* [Why use single type kernel]:
+   *
+   * This op is similar to a control flow op, it doses not need
+   * a op kernel, but in order to make it execute under dynamic
+   * graph mode, implement it with op kernel.
+   *
+   * So whether the kernel data type is int, float or other type,
+   * which has no effect on its execution logic, so directly
+   * specified a data type here.
+   *
+   * Of course, the data type here is also not important.
+   */
+
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(framework::proto::VarType::FP32,
+                                   ctx.GetPlace());
+  }
+};
+
+class CinnLaunchOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput(kX,
+             "(vector<LoDTensor>)"
+             "which are the input of graph inside the CinnLaunchOp.")
+        .AsDuplicable();
+    AddOutput(kOutputs,
+              "(vector<LoDTensor>)"
+              "which are the output of graph inside the CinnLaunchOp.")
+        .AsDuplicable();
+    AddAttr<std::string>(
+        kCompilationKey,
+        "(string)"
+        "a hash key used to get the graph object or its computation result.");
+    AddComment(R"DOC(
+CinnLaunch Operator.
+
+This operator is used to launch CINN(https://github.com/PaddlePaddle/CINN/blob/develop/README.md)
+to compile a graph and execute the compiled object.
+
+Both input and output of this operator are a set of variables
+which are input and output of the graph respectively that will be
+compiled and executed in this operator.
+In addition, there is an attribute named 'compilation_key' should be
+set necessarily to get corresponding ir::Graph object of the graph
+or its computation result.
+
+It accomplishes the computation of graph following several steps:
+  1. Fetch ir::Graph object from CinnCompiler using kCompilationKey
+  2. Compile the graph to a compiled object, and insert it to the
+     global cache so that we can directly query it from this cache next time
+     when shape of input variables are not changed at all.
+  3. Create and instantiate all variables used to execute compiled runtime program
+     if necessary according to the info(type,shape) included in the return scope.
+  4. Pack each tensor buffer of all above variables as execution arguments.
+  5. Launch execution of the runtime program with above arguments, then
+     the result would be output by writing value on underlying buffer address.
+
+)DOC");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(
+    cinn_launch, ops::CinnLaunchOp, ops::CinnLaunchOpMaker,
+    paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
+    paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
+/* see [Why use single type kernel] */
+REGISTER_OP_CPU_KERNEL(
+    cinn_launch,
+    ops::CinnLaunchOpKernel<paddle::platform::CPUDeviceContext, float>);
diff --git a/paddle/fluid/operators/cinn_launch_op.cu.cc b/paddle/fluid/operators/cinn_launch_op.cu.cc
new file mode 100644
index 0000000000000..7066cd4e59887
--- /dev/null
+++ b/paddle/fluid/operators/cinn_launch_op.cu.cc
@@ -0,0 +1,20 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/cinn_launch_op.h"
+
+/* see [Why use single type kernel] */
+REGISTER_OP_CUDA_KERNEL(cinn_launch,
+                        paddle::operators::CinnLaunchOpKernel<
+                            paddle::platform::CUDADeviceContext, float>);
diff --git a/paddle/fluid/operators/cinn_launch_op.h b/paddle/fluid/operators/cinn_launch_op.h
new file mode 100644
index 0000000000000..27ff99084a096
--- /dev/null
+++ b/paddle/fluid/operators/cinn_launch_op.h
@@ -0,0 +1,226 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include "cinn/hlir/framework/graph_compiler.h"
+#include "cinn/hlir/framework/scope.h"
+#include "cinn/runtime/cinn_runtime.h"
+#include "paddle/fluid/framework/ir/graph.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/framework/paddle2cinn/cinn_compiler.h"
+
+namespace paddle {
+namespace operators {
+
+static constexpr char kX[] = "X";
+static constexpr char kOutputs[] = "Out";
+static constexpr char kCompilationKey[] = "compilation_key";
+
+using LoDTensor = framework::LoDTensor;
+using CinnTensor = ::cinn::hlir::framework::Tensor;
+using CinnScope = ::cinn::hlir::framework::Scope;
+using CinnCompiler = framework::paddle2cinn::CinnCompiler;
+using CinnCompiledObject = framework::paddle2cinn::CinnCompiledObject;
+
+namespace details {
+
+// Tranform Paddle place to CINN target
+const ::cinn::common::Target& PlaceToCinnTarget(const platform::Place& place);
+
+// Print detailed compilation result of graph for debug
+void DebugCinnCompiledResult(const CinnCompiledObject& result);
+
+// Transform names of Paddle variables to CINN ones
+std::vector<std::string> MapPaddleVariablesToCinn(
+    const std::vector<std::string>& paddle_names,
+    const std::unordered_map<std::string, std::string>& paddle2cinn_varmap);
+
+// Get CinnTensor with variable name from CinnScope
+std::vector<CinnTensor> GetCinnTensorsFromCompiledScope(
+    const std::vector<std::string>& cinn_names, const CinnScope& cinn_scope);
+
+// Check whether tensors from Paddle and CINN respectively
+// of the same variable are equivalent in type and dimension
+void CheckTensorEquivalent(const std::string& paddle_name,
+                           const LoDTensor* paddle_tensor,
+                           const CinnTensor& cinn_tensor);
+
+// Allocate buffer to a Paddle tensor with assginment information from CINN
+void TensorMutableDataWithCinnInfo(const platform::Place& place,
+                                   const CinnTensor& cinn_tensor,
+                                   LoDTensor* paddle_tensor);
+
+// Extract temporary variable names from CinnScope by excluding
+// input and output variables
+std::vector<std::string> SeperateTempVar(
+    const CinnScope& cinn_scope,
+    const std::vector<std::string>& input_cinn_names,
+    const std::vector<std::string>& output_cinn_names);
+
+// Share the buffer of a Paddle tensor to CINN by packing memory address
+// in a cinn_buffer_t object
+std::unique_ptr<cinn_buffer_t> ShareTensorWithCinnBuffer(LoDTensor* tensor);
+
+// Check all execution arguments are carried
+void CheckArgumentsNotMissed(
+    const CinnScope& cinn_scope,
+    const std::map<std::string, cinn_pod_value_t>& name2argument);
+
+}  // namespace details
+
+template <typename DeviceContext, typename T>
+class CinnLaunchOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    const auto& scope = ctx.scope();
+    const auto& place = ctx.GetPlace();
+    // Step 1. Find graph object and prepare input
+    PADDLE_ENFORCE_EQ(ctx.HasAttr(kCompilationKey), true,
+                      platform::errors::NotFound(
+                          "No Attribute(%s) found for CinnLaunchOp operator.",
+                          kCompilationKey));
+    const auto& compilation_key =
+        ctx.template Attr<std::string>(kCompilationKey);
+    VLOG(4) << "CinnLaunchOp attribute(" << kCompilationKey << ") "
+            << "value:\n"
+            << CinnCompiler::GetInstance()->ReadableKey(compilation_key);
+
+    auto input_variable_names = ctx.InputNames(kX);
+    const auto& input_tensors = ctx.MultiInput<LoDTensor>(kX);
+    std::map<std::string, const LoDTensor*> inputs_name2tensor;
+    std::transform(input_variable_names.begin(), input_variable_names.end(),
+                   input_tensors.begin(),
+                   std::inserter(inputs_name2tensor, inputs_name2tensor.end()),
+                   [](const std::string& name, const LoDTensor* tensor) {
+                     return std::make_pair(name, tensor);
+                   });
+
+    // Step 2. Get compilation result of the graph
+    auto target = details::PlaceToCinnTarget(place);
+    const auto& cinn_compiled_object = CinnCompiler::GetInstance()->Compile(
+        compilation_key, inputs_name2tensor, target);
+    details::DebugCinnCompiledResult(cinn_compiled_object);
+
+    const auto& cinn_runtime_program = cinn_compiled_object.runtime_program;
+    const auto& cinn_scope = *(cinn_compiled_object.scope);
+    const auto& paddle2cinn_varmap = cinn_compiled_object.paddle2cinn_varmap;
+
+    // Step 3. Initialize all variables needed for cinn compiled runtime
+    //         program execution, and share buffers of their tensors into
+    //         cinn buffers through execution arguments passed.
+    VLOG(4) << "CinnLaunchOp initialize variables and prepare arguments";
+    std::map<std::string, cinn_pod_value_t> name2argument;
+    // because a cinn_pod_value_t does not own the cinn_buffer_t object,
+    // an extra stroage is necessary to keep the object and it can
+    // not be released until runtime program finish  execution.
+    std::vector<std::unique_ptr<cinn_buffer_t>> hold_buffers;
+
+    // 3.1 Prepare input variables: because tensors of input variables have
+    //     been initialized before graph compiled, just check the
+    //     equiality between tensors of paddle and cinn.
+    auto input_cinn_names = details::MapPaddleVariablesToCinn(
+        input_variable_names, paddle2cinn_varmap);
+    auto input_cinn_tensors =
+        details::GetCinnTensorsFromCompiledScope(input_cinn_names, cinn_scope);
+    for (auto i = 0; i < input_variable_names.size(); ++i) {
+      const auto& var_name = input_variable_names.at(i);
+      const auto& cinn_name = input_cinn_names.at(i);
+      auto* tensor = scope.GetVar(var_name)->GetMutable<LoDTensor>();
+      details::CheckTensorEquivalent(var_name, tensor,
+                                     input_cinn_tensors.at(i));
+
+      VLOG(4) << "Prepare input argument-" << i << ":"
+              << "name(" << var_name << "->" << cinn_name << "), "
+              << "tensor(type:" << tensor->type() << ","
+              << "dims:" << tensor->dims() << ").";
+      auto buffer = details::ShareTensorWithCinnBuffer(tensor);
+      name2argument.emplace(input_cinn_names.at(i), buffer.get());
+      hold_buffers.emplace_back(std::move(buffer));
+    }
+
+    // 3.2 Prepare output variables: all output variables should
+    //     be initialized and allocated buffer in advance before
+    //     the runtime program start execution, the compilation result
+    //     includes details of their buffer assginment which used by
+    //     Paddle tensor allocation. For those variables allocated yet,
+    //     like persistable parameters, just check the equiality between
+    //     Paddle allocation and CINN buffer assginment.
+    auto output_variable_names = ctx.OutputNames(kOutputs);
+    auto output_cinn_names = details::MapPaddleVariablesToCinn(
+        output_variable_names, paddle2cinn_varmap);
+    auto output_cinn_tensors =
+        details::GetCinnTensorsFromCompiledScope(output_cinn_names, cinn_scope);
+    for (auto i = 0; i < output_variable_names.size(); ++i) {
+      const auto& var_name = output_variable_names.at(i);
+      const auto& cinn_name = output_cinn_names.at(i);
+      auto* tensor = scope.GetVar(var_name)->GetMutable<LoDTensor>();
+      if (tensor->IsInitialized()) {
+        details::CheckTensorEquivalent(var_name, tensor,
+                                       output_cinn_tensors.at(i));
+      } else {
+        details::TensorMutableDataWithCinnInfo(place, output_cinn_tensors.at(i),
+                                               tensor);
+      }
+
+      VLOG(4) << "Prepare output argument-" << i << ":"
+              << "name(" << var_name << "->" << cinn_name << "), "
+              << "tensor(type:" << tensor->type() << ","
+              << "dims:" << tensor->dims() << ").";
+      auto buffer = details::ShareTensorWithCinnBuffer(tensor);
+      name2argument.emplace(output_cinn_names.at(i), buffer.get());
+      hold_buffers.emplace_back(std::move(buffer));
+    }
+
+    // 3.3 Prepare internal or temporary variables: Create a temporary
+    //     scope to keep internal variables within graph or temporary
+    //     variables needed by the compiled runtime program in addition.
+    //     Here we directly use the names from CinnScope as Paddle variable
+    //     names, because they will not be used outside the graph
+    //     and should be destructed after computation finished.
+    auto temp_variable_names = details::SeperateTempVar(
+        cinn_scope, input_cinn_names, output_cinn_names);
+    auto temp_scope = scope.NewTmpScope();
+    if (!temp_variable_names.empty()) {
+      auto temp_cinn_tensors = details::GetCinnTensorsFromCompiledScope(
+          temp_variable_names, cinn_scope);
+      for (auto i = 0; i < temp_variable_names.size(); ++i) {
+        const auto& var_name = temp_variable_names.at(i);
+        auto* tensor = temp_scope->Var(var_name)->GetMutable<LoDTensor>();
+        details::TensorMutableDataWithCinnInfo(place, temp_cinn_tensors.at(i),
+                                               tensor);
+
+        VLOG(4) << "Prepare temporary argument-" << i << ":"
+                << "name(" << var_name << "->" << var_name << "), "
+                << "tensor(type:" << tensor->type() << ","
+                << "dims:" << tensor->dims() << ").";
+        auto buffer = details::ShareTensorWithCinnBuffer(tensor);
+        name2argument.emplace(var_name, buffer.get());
+        hold_buffers.emplace_back(std::move(buffer));
+      }
+    }
+
+    // Step 4. Launch CINN to execute the compiled runtime program
+    details::CheckArgumentsNotMissed(cinn_scope, name2argument);
+    cinn_runtime_program->Execute(&name2argument);
+    VLOG(4) << "CinnLaunchOp launch execution done.";
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/cinn_launch_op_test.cc b/paddle/fluid/operators/cinn_launch_op_test.cc
new file mode 100644
index 0000000000000..cef95e0504591
--- /dev/null
+++ b/paddle/fluid/operators/cinn_launch_op_test.cc
@@ -0,0 +1,301 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/cinn_launch_op.h"
+#include <stdlib.h>
+#include <random>
+#include <string>
+#include "gtest/gtest.h"
+#include "paddle/fluid/framework/ddim.h"
+#include "paddle/fluid/framework/ir/graph.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/paddle2cinn/cinn_compiler.h"
+#include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/framework/tensor_util.h"
+#include "paddle/fluid/platform/cpu_helper.h"
+#include "paddle/fluid/platform/init.h"
+
+USE_OP(cinn_launch);
+USE_OP(elementwise_add);
+
+namespace paddle {
+namespace operators {
+
+using framework::ir::Graph;
+using framework::ir::Node;
+
+std::unique_ptr<Graph> CreateOnlyElementwiseAddGraph(
+    const std::string& x_name, const std::string& y_name,
+    const std::string& out_name) {
+  auto g = std::make_unique<Graph>(framework::ProgramDesc());
+  framework::OpDesc feed_op_x, feed_op_y;
+  feed_op_x.SetType("feed");
+  feed_op_x.SetOutput("Out", {x_name});
+  feed_op_y.SetType("feed");
+  feed_op_y.SetOutput("Out", {y_name});
+
+  framework::VarDesc x_var(x_name);
+  framework::VarDesc y_var(y_name);
+  framework::VarDesc out_var(out_name);
+
+  framework::OpDesc elementwise_add_op;
+  elementwise_add_op.SetType("add");
+  elementwise_add_op.SetInput("X", {x_name});
+  elementwise_add_op.SetInput("Y", {y_name});
+  elementwise_add_op.SetOutput("Out", {out_name});
+
+  auto* feed_op_node_x = g->CreateOpNode(&feed_op_x);
+  auto* feed_op_node_y = g->CreateOpNode(&feed_op_y);
+  auto* elementwise_add_node = g->CreateOpNode(&elementwise_add_op);
+  auto* x_node = g->CreateVarNode(&x_var);
+  auto* y_node = g->CreateVarNode(&y_var);
+  auto* out_node = g->CreateVarNode(&out_var);
+
+  // fill op node
+  feed_op_node_x->outputs = {x_node};
+  feed_op_node_y->outputs = {y_node};
+  elementwise_add_node->inputs = {x_node, y_node};
+  elementwise_add_node->outputs = {out_node};
+
+  // fill variable node
+  x_node->inputs = {feed_op_node_x};
+  x_node->outputs = {elementwise_add_node};
+  y_node->inputs = {feed_op_node_y};
+  y_node->outputs = {elementwise_add_node};
+  out_node->inputs = {elementwise_add_node};
+  return g;
+}
+
+void CreateInputVariablesWithRandomData(
+    const std::vector<std::string>& variable_names,
+    const framework::DDim& common_ddim, framework::Scope* scope) {
+  std::random_device seed;
+  std::default_random_engine engine(seed());
+  std::uniform_real_distribution<float> dist(0.f, 2.f);
+
+  for (const auto& var_name : variable_names) {
+    auto* tensor = scope->Var(var_name)->GetMutable<LoDTensor>();
+    auto* data = tensor->mutable_data<float>(common_ddim, platform::CPUPlace());
+    for (auto i = 0; i < tensor->numel(); ++i) {
+      data[i] = dist(engine);
+    }
+  }
+}
+
+void CopyInputDataToPlace(const framework::Scope& scope,
+                          const platform::Place& dst_place,
+                          framework::Scope* dst_scope) {
+  for (const auto& var_name : scope.LocalVarNames()) {
+    const auto& src_tensor = scope.GetVar(var_name)->Get<LoDTensor>();
+    auto* dst_tensor = dst_scope->Var(var_name)->GetMutable<LoDTensor>();
+    TensorCopySync(src_tensor, dst_place, dst_tensor);
+  }
+}
+
+TEST(CinnLaunchOpTest, TestElementwiseAddPass) {
+  paddle::framework::InitDevices();
+  platform::SetNumThreads(1);
+  // cache test graph into CinnCompiler
+  const auto& test_out_name = "test_out";
+  const auto& expected_out_name = "expected_out";
+  auto compilation_key = CinnCompiler::GetInstance()->AddGraph(
+      CreateOnlyElementwiseAddGraph("test_x", "test_y", test_out_name));
+  // create cinn_launch_op and elementwise_add op
+  auto cinn_launch_op = paddle::framework::OpRegistry::CreateOp(
+      "cinn_launch", {{"X", {"test_x", "test_y"}}}, {{"Out", {test_out_name}}},
+      {{"compilation_key", compilation_key}});
+  auto elementwise_add_op = paddle::framework::OpRegistry::CreateOp(
+      "elementwise_add", {{"X", {"test_x"}}, {"Y", {"test_y"}}},
+      {{"Out", {expected_out_name}}}, {{}});
+  // prepare input data
+  framework::Scope init_scope;
+  CreateInputVariablesWithRandomData({"test_x", "test_y"}, {10, 20},
+                                     &init_scope);
+  // Run ops and check the computation results
+  auto run_and_check_fn = [&](const platform::Place& place) {
+    framework::Scope scope;
+    CopyInputDataToPlace(init_scope, place, &scope);
+    scope.Var(test_out_name)->GetMutable<LoDTensor>();
+    scope.Var(expected_out_name)->GetMutable<LoDTensor>();
+
+    cinn_launch_op->Run(scope, place);
+    elementwise_add_op->Run(scope, place);
+
+    LoDTensor test_out, expected_out;
+    if (platform::is_cpu_place(place)) {
+      test_out.ShareDataWith(scope.Var(test_out_name)->Get<LoDTensor>());
+      expected_out.ShareDataWith(
+          scope.Var(expected_out_name)->Get<LoDTensor>());
+    } else {
+      TensorCopySync(scope.Var(test_out_name)->Get<LoDTensor>(),
+                     platform::CPUPlace(), &test_out);
+      TensorCopySync(scope.Var(expected_out_name)->Get<LoDTensor>(),
+                     platform::CPUPlace(), &expected_out);
+    }
+
+    ASSERT_TRUE(test_out.IsInitialized());
+    ASSERT_TRUE(expected_out.IsInitialized());
+    ASSERT_EQ(test_out.dims(), expected_out.dims());
+    const auto* test_data = test_out.data<float>();
+    const auto* excepted_data = expected_out.data<float>();
+    for (auto i = 0; i < expected_out.numel(); ++i) {
+      EXPECT_FLOAT_EQ(test_data[i], excepted_data[i]);
+    }
+  };
+
+  LOG(INFO) << "Check compute result on cpu";
+  run_and_check_fn(platform::CPUPlace());
+  run_and_check_fn(platform::CPUPlace());
+
+#ifdef PADDLE_WITH_CUDA
+  // create an new elementwise_add op
+  // because the above one cached the cpu kernel
+  LOG(INFO) << "Check compute result on gpu";
+  cinn_launch_op = paddle::framework::OpRegistry::CreateOp(
+      "cinn_launch", {{"X", {"test_x", "test_y"}}}, {{"Out", {test_out_name}}},
+      {{"compilation_key", compilation_key}});
+  elementwise_add_op = paddle::framework::OpRegistry::CreateOp(
+      "elementwise_add", {{"X", {"test_x"}}, {"Y", {"test_y"}}},
+      {{"Out", {expected_out_name}}}, {{}});
+  run_and_check_fn(platform::CUDAPlace());
+  run_and_check_fn(platform::CUDAPlace());
+#endif
+}
+
+namespace details {
+// Testing helper function used on CinnLaunchOpKernel in the following:
+// firstly build test data, then check both expected and illegal situations
+
+using CinnShape = ::cinn::hlir::framework::Shape;
+
+TEST(CinnLaunchOpHelperTest, TestPlaceToCinnTarget) {
+  ASSERT_EQ(PlaceToCinnTarget(platform::CPUPlace()),
+            ::cinn::common::DefaultHostTarget());
+  ASSERT_EQ(PlaceToCinnTarget(platform::CUDAPlace(0)),
+            ::cinn::common::DefaultNVGPUTarget());
+  ASSERT_THROW(PlaceToCinnTarget(platform::XPUPlace()),
+               paddle::platform::EnforceNotMet);
+}
+
+TEST(CinnLaunchOpHelperTest, TestMapPaddleVariablesToCinn) {
+  std::unordered_map<std::string, std::string> varmap(
+      {{"var1", "cinn_var1"}, {"var2", "cinn_var2"}, {"var3", "cinn_var3"}});
+
+  auto cinn_names = MapPaddleVariablesToCinn({"var1", "var3"}, varmap);
+  ASSERT_EQ(cinn_names.size(), 2);
+  EXPECT_EQ(cinn_names, std::vector<std::string>({"cinn_var1", "cinn_var3"}));
+  ASSERT_THROW(MapPaddleVariablesToCinn({"var1", "not_exist"}, varmap),
+               paddle::platform::EnforceNotMet);
+}
+
+TEST(CinnLaunchOpHelperTest, TestGetCinnTensorsFromCompiledScope) {
+  CinnScope cinn_scope;
+  cinn_scope.Var<CinnTensor>("cinn_var1");
+  cinn_scope.Var<CinnTensor>("cinn_var2");
+  cinn_scope.Var<CinnTensor>("cinn_var3");
+
+  auto cinn_tensors =
+      GetCinnTensorsFromCompiledScope({"cinn_var1", "cinn_var3"}, cinn_scope);
+  ASSERT_EQ(cinn_tensors.size(), 2);
+  ASSERT_EQ(cinn_tensors.front().get(),
+            cinn_scope.GetTensor("cinn_var1").get());
+  ASSERT_EQ(cinn_tensors.back().get(), cinn_scope.GetTensor("cinn_var3").get());
+  ASSERT_THROW(
+      GetCinnTensorsFromCompiledScope({"cinn_var1", "not_exist"}, cinn_scope),
+      paddle::platform::EnforceNotMet);
+}
+
+TEST(CinnLaunchOpHelperTest, TestCheckTensorEquivalent) {
+  platform::CPUPlace place;
+  framework::Scope scope;
+  auto* tensor1 = scope.Var("var1")->GetMutable<LoDTensor>();
+  tensor1->mutable_data<float>(framework::make_ddim({5, 8}), place);
+
+  CinnScope cinn_scope;
+  cinn_scope.Var<CinnTensor>("cinn_var1");
+  auto cinn_tensor1 = cinn_scope.GetTensor("cinn_var1");
+  cinn_tensor1->Resize(CinnShape({5, 8}));
+  cinn_tensor1->set_type(::cinn::common::type_of<float>());
+
+  ASSERT_NO_THROW(CheckTensorEquivalent("var1", tensor1, cinn_tensor1));
+  auto tensor2 = scope.Var("var2")->GetMutable<LoDTensor>();
+  ASSERT_THROW(CheckTensorEquivalent("var2", tensor2, cinn_tensor1),
+               paddle::platform::EnforceNotMet);
+
+  cinn_tensor1->Resize(CinnShape({5, 7}));
+  ASSERT_THROW(CheckTensorEquivalent("var1", tensor1, cinn_tensor1),
+               paddle::platform::EnforceNotMet);
+}
+
+TEST(CinnLaunchOpHelperTest, TestTensorMutableDataWithCinnInfo) {
+  platform::CPUPlace place;
+  framework::Scope scope;
+  auto* tensor1 = scope.Var("var1")->GetMutable<LoDTensor>();
+  CinnScope cinn_scope;
+  cinn_scope.Var<CinnTensor>("cinn_var1");
+  auto cinn_tensor1 = cinn_scope.GetTensor("cinn_var1");
+  cinn_tensor1->Resize(CinnShape({5, 8}));
+
+  ASSERT_NO_THROW(TensorMutableDataWithCinnInfo(place, cinn_tensor1, tensor1));
+  ASSERT_TRUE(tensor1->IsInitialized());
+  ASSERT_EQ(tensor1->dims(), framework::make_ddim({5, 8}));
+}
+
+TEST(CinnLaunchOpHelperTest, TestSeperateTempVar) {
+  CinnScope cinn_scope;
+  cinn_scope.Var<CinnTensor>("cinn_var1");
+  cinn_scope.Var<CinnTensor>("cinn_var2");
+  cinn_scope.Var<CinnTensor>("cinn_var3");
+  cinn_scope.Var<CinnTensor>("cinn_var4");
+
+  auto temp_names =
+      SeperateTempVar(cinn_scope, {"cinn_var1", "cinn_var2"}, {"cinn_var4"});
+  ASSERT_EQ(temp_names.size(), 1);
+  EXPECT_EQ(temp_names.front(), "cinn_var3");
+}
+
+TEST(CinnLaunchOpHelperTest, TestShareTensorWithCinnBuffer) {
+  platform::CPUPlace place;
+  framework::Scope scope;
+  auto* tensor1 = scope.Var("var1")->GetMutable<LoDTensor>();
+  tensor1->mutable_data<float>(framework::make_ddim({5, 6}), place);
+  auto* data1 = tensor1->data<float>();
+  data1[0] = 9.99f;
+  data1[10] = 19.99f;
+
+  auto cinn_buffer = ShareTensorWithCinnBuffer(tensor1);
+  ASSERT_NE(cinn_buffer->memory, nullptr);
+  ASSERT_EQ(cinn_buffer->num_elements(), 30);
+  auto* shadow_data = reinterpret_cast<float*>(cinn_buffer->memory);
+  EXPECT_FLOAT_EQ(shadow_data[0], 9.99f);
+  EXPECT_FLOAT_EQ(shadow_data[10], 19.99f);
+}
+
+TEST(CinnLaunchOpHelperTest, TestCheckArgumentsNotMissed) {
+  CinnScope cinn_scope;
+  cinn_scope.Var<CinnTensor>("cinn_var1");
+  cinn_scope.Var<CinnTensor>("cinn_var2");
+  std::map<std::string, cinn_pod_value_t> name2argument(
+      {{"cinn_var1", cinn_pod_value_t()}, {"cinn_var2", cinn_pod_value_t()}});
+
+  ASSERT_NO_THROW(CheckArgumentsNotMissed(cinn_scope, name2argument));
+  name2argument.erase("cinn_var2");
+  ASSERT_THROW(CheckArgumentsNotMissed(cinn_scope, name2argument),
+               paddle::platform::EnforceNotMet);
+}
+
+}  // namespace details
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/compat/hard_sigmoid.pbtxt b/paddle/fluid/operators/compat/hard_sigmoid.pbtxt
new file mode 100644
index 0000000000000..c8b66edf2223a
--- /dev/null
+++ b/paddle/fluid/operators/compat/hard_sigmoid.pbtxt
@@ -0,0 +1,17 @@
+type: "hard_sigmoid"
+def {
+  inputs {
+    name: "X"
+  }
+  outputs {
+    name: "Out"
+  }
+  attrs {
+    name: "slope"
+    type: FLOAT
+  }
+  attrs {
+    name: "offset"
+    type: FLOAT
+  }
+}
diff --git a/paddle/fluid/operators/controlflow/fetch_v2_op.cc b/paddle/fluid/operators/controlflow/fetch_v2_op.cc
index 355e52b9436e6..0837caf9353a3 100644
--- a/paddle/fluid/operators/controlflow/fetch_v2_op.cc
+++ b/paddle/fluid/operators/controlflow/fetch_v2_op.cc
@@ -77,12 +77,35 @@ class FetchV2Op : public framework::OperatorWithKernel {
   framework::OpKernelType GetKernelTypeForVar(
       const std::string &var_name, const framework::Tensor &tensor,
       const framework::OpKernelType &expected_kernel_type) const override {
+    if (!tensor.IsInitialized()) {
+      return expected_kernel_type;
+    }
     return framework::OpKernelType(expected_kernel_type.data_type_,
                                    tensor.place(), tensor.layout());
   }
 
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext &ctx) const override {
+    auto *fetch_var = ctx.InputVar("X");
+    if (fetch_var == nullptr) {
+      return framework::OpKernelType(framework::proto::VarType::FP32,
+                                     platform::CPUPlace());
+    }
+
+    if (fetch_var->IsType<framework::LoDTensor>()) {
+      auto &src_item = fetch_var->Get<framework::LoDTensor>();
+      if (!src_item.IsInitialized()) {
+        return framework::OpKernelType(framework::proto::VarType::FP32,
+                                       platform::CPUPlace());
+      }
+    } else {
+      auto &src_item = fetch_var->Get<framework::LoDTensorArray>();
+      if (src_item.empty() || !src_item[0].IsInitialized()) {
+        return framework::OpKernelType(framework::proto::VarType::FP32,
+                                       platform::CPUPlace());
+      }
+    }
+
     return framework::OpKernelType(
         OperatorWithKernel::IndicateVarDataType(ctx, "X"),
         platform::CPUPlace());
@@ -127,10 +150,16 @@ class FetchV2Kernel {
 
     if (fetch_var->IsType<framework::LoDTensor>()) {
       auto &src_item = fetch_var->Get<framework::LoDTensor>();
+      if (!src_item.IsInitialized()) {
+        return;
+      }
       auto *dst_item = &(BOOST_GET(framework::LoDTensor, fetch_list->at(col)));
-      PADDLE_ENFORCE_EQ(platform::is_cpu_place(src_item.place()), true,
-                        platform::errors::InvalidArgument(
-                            "Tensor's place of input(X) must be CPUPlace."));
+      bool check_place = platform::is_cpu_place(src_item.place()) ||
+                         platform::is_cuda_pinned_place(src_item.place());
+      PADDLE_ENFORCE_EQ(
+          check_place, true,
+          platform::errors::InvalidArgument("Tensor's place of input(X) must "
+                                            "be CPUPlace or CUDAPinnedPlace."));
       if (deepcopy) {
         DeepCopy(src_item, fetch_var_name, dst_item);
       } else {
@@ -170,9 +199,7 @@ class FetchV2OpProtoMaker : public framework::OpProtoAndCheckerMaker {
         .SetDefault(true);
     AddComment(R"DOC(
 FetchV2 Operator.
-
 It should not be configured by users directly.
-
 )DOC");
   }
 };
@@ -188,8 +215,11 @@ REGISTER_OPERATOR(
     paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
     paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
 
-REGISTER_OP_CPU_KERNEL_FUNCTOR(fetch_v2, float, ops::FetchV2Kernel, double,
-                               ops::FetchV2Kernel, int, ops::FetchV2Kernel,
-                               int64_t, ops::FetchV2Kernel, bool,
-                               ops::FetchV2Kernel, plat::float16,
-                               ops::FetchV2Kernel);
+REGISTER_OP_CPU_KERNEL_FUNCTOR(
+    fetch_v2, float, ops::FetchV2Kernel, double, ops::FetchV2Kernel, int8_t,
+    ops::FetchV2Kernel, uint8_t, ops::FetchV2Kernel, int, ops::FetchV2Kernel,
+    int64_t, ops::FetchV2Kernel, bool, ops::FetchV2Kernel,
+    paddle::platform::bfloat16, ops::FetchV2Kernel,
+    paddle::platform::complex<float>, ops::FetchV2Kernel,
+    paddle::platform::complex<double>, ops::FetchV2Kernel, plat::float16,
+    ops::FetchV2Kernel, int16_t, ops::FetchV2Kernel);
diff --git a/paddle/fluid/operators/copy_cross_scope_test.cc b/paddle/fluid/operators/copy_cross_scope_test.cc
index e175b235f9c18..37bc32d745eda 100644
--- a/paddle/fluid/operators/copy_cross_scope_test.cc
+++ b/paddle/fluid/operators/copy_cross_scope_test.cc
@@ -61,7 +61,7 @@ void Compare1(f::Scope* scope, const p::DeviceContext& ctx,
 
   // run
   f::AttributeMap attrs = {{"to_main_scope", false}, {"num_micro_batches", 3}};
-  std::map<std::string, std::vector<std::string>> output;
+  f::VariableNameMap output;
   auto op = f::OpRegistry::CreateOp(op_type, {{"X", {"tmp"}}, {"Id", {"Id"}}},
                                     output, attrs);
 
@@ -109,7 +109,7 @@ void Compare2(f::Scope* scope, const p::DeviceContext& ctx,
 
   // run
   f::AttributeMap attrs = {{"to_main_scope", true}, {"num_micro_batches", 3}};
-  std::map<std::string, std::vector<std::string>> output;
+  f::VariableNameMap output;
   auto op = f::OpRegistry::CreateOp(op_type, {{"X", {"tmp"}}, {"Id", {"Id"}}},
                                     output, attrs);
 
diff --git a/paddle/fluid/operators/dot_op.h b/paddle/fluid/operators/dot_op.h
index 09d607891b485..f6877c57a5c18 100644
--- a/paddle/fluid/operators/dot_op.h
+++ b/paddle/fluid/operators/dot_op.h
@@ -19,6 +19,11 @@
 #include "paddle/fluid/operators/math/complex_functors.h"
 #include "paddle/fluid/platform/for_range.h"
 
+// only can include the headers in paddle/pten/api dirs
+#include "paddle/pten/api/lib/utils/tensor_utils.h"
+#include "paddle/pten/include/core.h"
+#include "paddle/pten/include/linalg.h"
+
 namespace paddle {
 namespace operators {
 
@@ -228,48 +233,23 @@ struct DotGradFunction<DeviceContext, T, math::DisableComplex<T>> {
   }
 };
 
+// See Note [ Why still keep the original kernel implementation? ]
 template <typename DeviceContext, typename T>
 class DotKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* tensor_x = ctx.Input<Tensor>("X");
-    auto* tensor_y = ctx.Input<Tensor>("Y");
-    auto* tensor_out = ctx.Output<Tensor>("Out");
-    tensor_out->mutable_data<T>(ctx.GetPlace());
-
-#if defined(__NVCC__) || defined(__HIPCC__)
-    if (1 == tensor_out->dims().size()) {
-      auto out = framework::EigenScalar<T>::From(*tensor_out);
-      auto x = framework::EigenVector<T>::Flatten(*tensor_x);
-      auto y = framework::EigenVector<T>::Flatten(*tensor_y);
-
-      auto& dev = *ctx.template device_context<DeviceContext>().eigen_device();
-      out.device(dev) = (x * y).sum();
-    } else {
-      auto out = framework::EigenMatrix<T>::From(*tensor_out);
-      auto x = framework::EigenMatrix<T>::From(*tensor_x);
-      auto y = framework::EigenMatrix<T>::From(*tensor_y);
-
-      auto& dev = *ctx.template device_context<DeviceContext>().eigen_device();
-      out.device(dev) = (x * y).sum(Eigen::DSizes<int, 1>(1));
-    }
-#else
-    auto const *x = tensor_x->data<T>(), *x_ = &x[0];
-    auto const *y = tensor_y->data<T>(), *y_ = &y[0];
-    auto* z = tensor_out->data<T>();
-
-    // Loop over the total N elements of both operands while sum-reducing every
-    // B pairs along the way where B is the dimension of the least ordered axis
-    auto&& d = tensor_x->dims();
-    auto const N = tensor_x->numel();
-    auto const B = d[d.size() - 1];
-
-    for (int j = 0; j < N / B; j++) {
-      T ss = 0;
-      for (int i = 0; i < B; i++) ss += (*x_++) * (*y_++);
-      z[j] = ss;
-    }
-#endif
+    auto* x = ctx.Input<Tensor>("X");
+    auto* y = ctx.Input<Tensor>("Y");
+    auto* out = ctx.Output<Tensor>("Out");
+    auto& dev_ctx = ctx.device_context<DeviceContext>();
+    out->mutable_data<T>(x->place());
+
+    auto pt_x = paddle::experimental::MakePtenDenseTensor(*x);
+    auto pt_y = paddle::experimental::MakePtenDenseTensor(*y);
+    auto pt_out = paddle::experimental::MakePtenDenseTensor(*out);
+
+    // call new kernel
+    pten::Dot<T>(dev_ctx, *pt_x.get(), *pt_y.get(), pt_out.get());
   }
 };
 
diff --git a/paddle/fluid/operators/eigvalsh_op.cc b/paddle/fluid/operators/eigvalsh_op.cc
new file mode 100644
index 0000000000000..fd5893df0c449
--- /dev/null
+++ b/paddle/fluid/operators/eigvalsh_op.cc
@@ -0,0 +1,163 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/eigvalsh_op.h"
+
+namespace paddle {
+namespace operators {
+
+using framework::Tensor;
+
+class EigvalshOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "Eigvalsh");
+    OP_INOUT_CHECK(ctx->HasOutput("Eigenvalues"), "Output", "Eigenvalues",
+                   "Eigvalsh");
+
+    auto input_dim = ctx->GetInputDim("X");
+    auto rank = input_dim.size();
+
+    PADDLE_ENFORCE_GE(rank, 2,
+                      platform::errors::InvalidArgument(
+                          "The Input(X) should have at least 2 dimensions."
+                          "But received a %d dimension tensor.",
+                          rank));
+    PADDLE_ENFORCE_EQ(
+        input_dim[rank - 2], input_dim[rank - 1],
+        platform::errors::InvalidArgument(
+            "Eigvalsh op is designed for square matrix, consequently"
+            "inner-most 2 dimensions of Input(X) should be symmetric."
+            "But received X's shape[-2] = %d and shape[-1] = %d.",
+            input_dim[rank - 2], input_dim[rank - 1]));
+
+    std::vector<int64_t> values_dim;
+
+    for (auto i = 0; i < rank - 1; i++) {
+      values_dim.emplace_back(input_dim[i]);
+    }
+
+    ctx->SetOutputDim("Eigenvalues", framework::make_ddim(values_dim));
+
+    if (ctx->HasOutput("Eigenvectors")) {
+      ctx->SetOutputDim("Eigenvectors", input_dim);
+    }
+  }
+};
+
+class EigvalshOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("X",
+             "(Tensor), Hermitian or real symmetric matrices."
+             "Its shape should be [*, N, N] where * is zero or"
+             "more batch dimensions. The data type is float32 ,"
+             "float64, complex64, complex128.");
+    AddOutput("Eigenvalues",
+              "(Tensor), The eigenvalues in ascending order."
+              "The data type is float32 or float64.");
+    AddOutput(
+        "Eigenvectors",
+        "(Tensor), The column is the normalized eigenvector "
+        "corresponding to the eigenvalue. The data type is the same as ``X``."
+        "Eigenvectors are required to calculate gradient when backward.");
+    AddAttr<std::string>(
+        "UPLO",
+        "(string, default 'L'), 'L' represents the lower triangular matrix,"
+        "'U' represents the upper triangular matrix.")
+        .SetDefault("L");
+    AddAttr<bool>("is_test",
+                  "(bool, default false) Set to true for inference only, false "
+                  "for training.")
+        .SetDefault(false);
+    AddComment(R"DOC(
+Eigvalsh Operator.
+
+Computes the eigenvalues of a complex Hermitian
+ (conjugate symmetric) or a real symmetric matrix.
+
+)DOC");
+  }
+};
+
+class EigvalshGradOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    OP_INOUT_CHECK(ctx->HasInput("Eigenvectors"), "Input", "Eigenvectors",
+                   "EigvalshGrad");
+    OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Eigenvalues")),
+                   "Input", "Eigenvalues@GRAD", "EigvalshGrad");
+    auto dims = ctx->GetInputDim("Eigenvectors");
+    auto x_grad_name = framework::GradVarName("X");
+    if (ctx->HasOutput(x_grad_name)) {
+      ctx->SetOutputDim(x_grad_name, dims);
+    }
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        OperatorWithKernel::IndicateVarDataType(ctx, "Eigenvectors"),
+        ctx.device_context());
+  }
+};
+
+template <typename T>
+class EigvalshGradOpMaker : public framework::SingleGradOpMaker<T> {
+ public:
+  using framework::SingleGradOpMaker<T>::SingleGradOpMaker;
+
+ protected:
+  void Apply(GradOpPtr<T> op) const override {
+    op->SetType(this->ForwardOpType() + "_grad");
+    op->SetInput("Eigenvectors", this->Output("Eigenvectors"));
+    op->SetInput(framework::GradVarName("Eigenvalues"),
+                 this->OutputGrad("Eigenvalues"));
+    op->SetAttrMap(this->Attrs());
+    op->SetOutput(framework::GradVarName("X"), this->InputGrad("X"));
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+
+REGISTER_OPERATOR(eigvalsh, ops::EigvalshOp, ops::EigvalshOpMaker,
+                  ops::EigvalshGradOpMaker<paddle::framework::OpDesc>,
+                  ops::EigvalshGradOpMaker<paddle::imperative::OpBase>);
+REGISTER_OPERATOR(eigvalsh_grad, ops::EigvalshGradOp);
+
+REGISTER_OP_CPU_KERNEL(
+    eigvalsh,
+    ops::EigvalshKernel<paddle::platform::CPUDeviceContext, float, float>,
+    ops::EigvalshKernel<paddle::platform::CPUDeviceContext, double, double>,
+    ops::EigvalshKernel<paddle::platform::CPUDeviceContext, float,
+                        paddle::platform::complex<float>>,
+    ops::EigvalshKernel<paddle::platform::CPUDeviceContext, double,
+                        paddle::platform::complex<double>>);
+
+REGISTER_OP_CPU_KERNEL(
+    eigvalsh_grad,
+    ops::EigvalshGradKernel<paddle::platform::CPUDeviceContext, float, float>,
+    ops::EigvalshGradKernel<paddle::platform::CPUDeviceContext, double, double>,
+    ops::EigvalshGradKernel<paddle::platform::CPUDeviceContext, float,
+                            paddle::platform::complex<float>>,
+    ops::EigvalshGradKernel<paddle::platform::CPUDeviceContext, double,
+                            paddle::platform::complex<double>>);
diff --git a/paddle/fluid/operators/eigvalsh_op.cu b/paddle/fluid/operators/eigvalsh_op.cu
new file mode 100644
index 0000000000000..a623307857094
--- /dev/null
+++ b/paddle/fluid/operators/eigvalsh_op.cu
@@ -0,0 +1,36 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/eigvalsh_op.h"
+
+namespace ops = paddle::operators;
+
+REGISTER_OP_CUDA_KERNEL(
+    eigvalsh,
+    ops::EigvalshKernel<paddle::platform::CUDADeviceContext, float, float>,
+    ops::EigvalshKernel<paddle::platform::CUDADeviceContext, double, double>,
+    ops::EigvalshKernel<paddle::platform::CUDADeviceContext, float,
+                        paddle::platform::complex<float>>,
+    ops::EigvalshKernel<paddle::platform::CUDADeviceContext, double,
+                        paddle::platform::complex<double>>);
+
+REGISTER_OP_CUDA_KERNEL(
+    eigvalsh_grad,
+    ops::EigvalshGradKernel<paddle::platform::CUDADeviceContext, float, float>,
+    ops::EigvalshGradKernel<paddle::platform::CUDADeviceContext, double,
+                            double>,
+    ops::EigvalshGradKernel<paddle::platform::CUDADeviceContext, float,
+                            paddle::platform::complex<float>>,
+    ops::EigvalshGradKernel<paddle::platform::CUDADeviceContext, double,
+                            paddle::platform::complex<double>>);
diff --git a/paddle/fluid/operators/eigvalsh_op.h b/paddle/fluid/operators/eigvalsh_op.h
new file mode 100644
index 0000000000000..6c40ce107a317
--- /dev/null
+++ b/paddle/fluid/operators/eigvalsh_op.h
@@ -0,0 +1,79 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/math/eigen_values_vectors.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
+
+template <typename DeviceContext, typename ValueType, typename T>
+class EigvalshKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto input = ctx.Input<Tensor>("X");
+    auto output_w = ctx.Output<Tensor>("Eigenvalues");
+
+    std::string lower = ctx.Attr<std::string>("UPLO");
+    bool is_lower = (lower == "L");
+    bool is_test = ctx.Attr<bool>("is_test");
+    math::MatrixEighFunctor<DeviceContext, T> functor;
+    if (is_test) {
+      functor(ctx, *input, output_w, nullptr, is_lower, false);
+    } else {
+      auto output_v = ctx.Output<Tensor>("Eigenvectors");
+      functor(ctx, *input, output_w, output_v, is_lower, true);
+    }
+  }
+};
+
+template <typename DeviceContext, typename ValueType, typename T>
+class EigvalshGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto& x_grad = *ctx.Output<framework::Tensor>(framework::GradVarName("X"));
+    auto& output_v = *ctx.Input<Tensor>("Eigenvectors");
+    auto& output_w_grad =
+        *ctx.Input<Tensor>(framework::GradVarName("Eigenvalues"));
+
+    auto dito =
+        math::DeviceIndependenceTensorOperations<DeviceContext, T, ValueType>(
+            ctx);
+    auto tV = dito.Transpose(dito.Conj(output_v));
+
+    // compute elementwise multiply of output_v and output_w_grad
+    x_grad.mutable_data<T>(output_v.dims(), ctx.GetPlace());
+    auto output_v_vector = EigenVector<T>::Flatten(output_v);
+    auto output_w_grad_vector = EigenVector<ValueType>::Flatten(output_w_grad);
+    auto result_vector = EigenVector<T>::Flatten(x_grad);
+    auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
+    std::vector<int> broadcast_factor;
+    broadcast_factor.push_back(output_v.dims().at(output_v.dims().size() - 1));
+    result_vector.device(place) =
+        output_v_vector * output_w_grad_vector.broadcast(broadcast_factor);
+
+    x_grad = dito.Matmul(x_grad, tV);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_op_npu.cc b/paddle/fluid/operators/elementwise/elementwise_mul_op_npu.cc
index b2030ad21e8d1..36a7d54f8c1c2 100644
--- a/paddle/fluid/operators/elementwise/elementwise_mul_op_npu.cc
+++ b/paddle/fluid/operators/elementwise/elementwise_mul_op_npu.cc
@@ -143,8 +143,16 @@ class ElementwiseMulGradNPUKernel : public framework::OpKernel<T> {
 namespace ops = paddle::operators;
 
 REGISTER_OP_NPU_KERNEL(elementwise_mul, ops::ElementwiseMulNPUKernel<float>,
-                       ops::ElementwiseMulNPUKernel<paddle::platform::float16>);
+                       ops::ElementwiseMulNPUKernel<paddle::platform::float16>,
+#ifdef PADDLE_WITH_ASCEND_INT64
+                       ops::ElementwiseMulNPUKernel<int64_t>,
+#endif
+                       ops::ElementwiseMulNPUKernel<int>);
 
 REGISTER_OP_NPU_KERNEL(
     elementwise_mul_grad, ops::ElementwiseMulGradNPUKernel<float>,
-    ops::ElementwiseMulGradNPUKernel<paddle::platform::float16>);
+    ops::ElementwiseMulGradNPUKernel<paddle::platform::float16>,
+#ifdef PADDLE_WITH_ASCEND_INT64
+    ops::ElementwiseMulGradNPUKernel<int64_t>,
+#endif
+    ops::ElementwiseMulGradNPUKernel<int>);
diff --git a/paddle/fluid/operators/elementwise/elementwise_op.h b/paddle/fluid/operators/elementwise/elementwise_op.h
index 13e4624ef717f..651f0e3dc8014 100644
--- a/paddle/fluid/operators/elementwise/elementwise_op.h
+++ b/paddle/fluid/operators/elementwise/elementwise_op.h
@@ -129,7 +129,7 @@ class ElementwiseOp : public framework::OperatorWithKernel {
 
   framework::OpKernelType GetKernelTypeForVar(
       const std::string &var_name, const framework::Tensor &tensor,
-      const framework::OpKernelType &expected_kernel_type) const {
+      const framework::OpKernelType &expected_kernel_type) const override {
     if (framework::IsComplexType(expected_kernel_type.data_type_)) {
       // only promote inputs’s types when contains complex input
       return framework::OpKernelType(tensor.type(), tensor.place(),
diff --git a/paddle/fluid/operators/elementwise/mkldnn/elementwise_div_mkldnn_op.cc b/paddle/fluid/operators/elementwise/mkldnn/elementwise_div_mkldnn_op.cc
new file mode 100644
index 0000000000000..c037daba0ee3f
--- /dev/null
+++ b/paddle/fluid/operators/elementwise/mkldnn/elementwise_div_mkldnn_op.cc
@@ -0,0 +1,147 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h"
+
+namespace paddle {
+namespace framework {
+class ExecutionContext;
+}  // namespace framework
+namespace platform {
+class CPUDeviceContext;
+struct CPUPlace;
+}  // namespace platform
+}  // namespace paddle
+
+namespace paddle {
+namespace operators {
+template <typename T>
+class EltwiseDivMKLDNNGradKernel : public ElemwiseGradKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    ElemwiseGradKernel<T>::Compute(ctx);
+
+    auto& dev_ctx =
+        ctx.template device_context<platform::MKLDNNDeviceContext>();
+    const auto& mkldnn_engine = dev_ctx.GetEngine();
+
+    auto* y = ctx.Input<framework::Tensor>("Y");
+    auto* out = ctx.Input<framework::Tensor>("Out");
+    auto* dout = ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
+    auto* dx = ctx.Output<framework::Tensor>(framework::GradVarName("X"));
+    auto* dy = ctx.Output<framework::Tensor>(framework::GradVarName("Y"));
+    int axis = ctx.Attr<int>("axis");
+
+    auto& astream = platform::MKLDNNDeviceContext::tls().get_stream();
+
+    if (dx) {
+      // dx = dout / y
+
+      platform::BinaryMKLDNNHandler<T> handler(
+          dnnl::algorithm::binary_div, axis, mkldnn_engine, ctx.GetPlace(),
+          dout, y, dx, 1.0f, 1.0f, 1.0f);
+
+      const auto src_dout_memory = handler.AcquireSrcMemory(dout);
+      const auto src_y_memory = handler.AcquireSecondSrcMemory(y);
+      const auto dst_dx_memory = handler.AcquireDstMemory(dx);
+
+      const auto binary_prim = handler.AcquireForwardPrimitive();
+
+      const std::unordered_map<int, dnnl::memory> args = {
+          {DNNL_ARG_SRC_0, *src_dout_memory},
+          {DNNL_ARG_SRC_1, *src_y_memory},
+          {DNNL_ARG_DST, *dst_dx_memory}};
+
+      binary_prim->execute(astream, args);
+      astream.wait();
+
+      dx->set_layout(framework::DataLayout::kMKLDNN);
+      dx->set_format(platform::GetMKLDNNFormat(*dst_dx_memory));
+    }
+
+    if (dy) {
+      // dy = -dout * out / y
+
+      platform::BinaryMKLDNNHandler<T> y_handler(
+          dnnl::algorithm::binary_div, axis, mkldnn_engine, ctx.GetPlace(), y,
+          y, nullptr, 1.0f, 1.0f, 1.0f);
+
+      const auto y_memory = y_handler.AcquireSrcMemory(y);
+
+      dnnl::post_ops po;
+      po.append_binary(dnnl::algorithm::binary_div, y_memory->get_desc());
+
+      platform::BinaryMKLDNNHandler<T> handler(
+          dnnl::algorithm::binary_mul, axis, mkldnn_engine, ctx.GetPlace(),
+          dout, out, nullptr, -1.0f, 1.0f, 1.0f, po);
+
+      const auto src_dout_memory = handler.AcquireSrcMemory(dout);
+      const auto src_out_memory = handler.AcquireSecondSrcMemory(out);
+
+      // If broadcasting is in use then let's write to temporary
+      // buffer allocated by oneDNN
+      const auto dst_dy_memory = (dout->dims() == dy->dims())
+                                     ? handler.AcquireDstMemory(dy)
+                                     : handler.AcquireDstMemory();
+
+      const auto binary_prim = handler.AcquireForwardPrimitive();
+
+      const std::unordered_map<int, dnnl::memory> args = {
+          {DNNL_ARG_SRC_0, *src_dout_memory},
+          {DNNL_ARG_SRC_1, *src_out_memory},
+          {DNNL_ARG_DST, *dst_dy_memory},
+          {DNNL_ARG_ATTR_MULTIPLE_POST_OP(0) | DNNL_ARG_SRC_1, *y_memory}};
+
+      binary_prim->execute(astream, args);
+      astream.wait();
+
+      dy->set_layout(framework::DataLayout::kMKLDNN);
+
+      // Reduction is needed for broadcasting scenario
+      if (dout->dims() != dy->dims()) {
+        platform::ReductionMKLDNNHandler<T> handler_sum(
+            dnnl::algorithm::reduction_sum, 0.0f, 0.0f, mkldnn_engine,
+            ctx.GetPlace(), dout, dy, CalculateBroadcastedDims(dout, dy));
+        auto dy_memory_p = handler_sum.AcquireDstMemory(dy);
+        auto reduction_p = handler_sum.AcquireForwardPrimitive();
+
+        // As source we use mem object with results from binary operation
+        reduction_p->execute(astream, {{DNNL_ARG_SRC, *dst_dy_memory},
+                                       {DNNL_ARG_DST, *dy_memory_p}});
+        astream.wait();
+        dy->set_format(
+            platform::GetMKLDNNFormat(dy_memory_p->get_desc().reshape(
+                framework::vectorize<int64_t>(dy->dims()))));
+
+      } else {
+        dy->set_format(platform::GetMKLDNNFormat(*dst_dy_memory));
+      }
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+
+// TODO(piotrekobi) add int8, uint8 support
+REGISTER_OP_KERNEL(elementwise_div, MKLDNN, paddle::platform::CPUPlace,
+                   ops::EltwiseMKLDNNKernel<float, dnnl::algorithm::binary_div>,
+                   ops::EltwiseMKLDNNKernel<paddle::platform::bfloat16,
+                                            dnnl::algorithm::binary_div>)
+
+REGISTER_OP_KERNEL(elementwise_div_grad, MKLDNN, paddle::platform::CPUPlace,
+                   ops::EltwiseDivMKLDNNGradKernel<paddle::platform::bfloat16>,
+                   ops::EltwiseDivMKLDNNGradKernel<float>)
diff --git a/paddle/fluid/operators/expand_v2_op_npu.cc b/paddle/fluid/operators/expand_v2_op_npu.cc
index 4b0e0770573a6..46385a20ab989 100644
--- a/paddle/fluid/operators/expand_v2_op_npu.cc
+++ b/paddle/fluid/operators/expand_v2_op_npu.cc
@@ -106,11 +106,28 @@ class ExpandV2NPUKernel : public framework::OpKernel<T> {
     Out->Resize(out_dims);
     Out->mutable_data<T>(ctx.GetPlace());
 
-    const auto& runner = NpuOpRunner("ExpandD", {*X}, {*Out}, attr_input);
-    auto stream =
-        ctx.template device_context<paddle::platform::NPUDeviceContext>()
-            .stream();
-    runner.Run(stream);
+    const auto& dev_ctx =
+        ctx.template device_context<paddle::platform::NPUDeviceContext>();
+    auto op_func = [](const std::vector<Tensor>& inputs,
+                      const std::vector<Tensor>& outputs,
+                      const NPUAttributeMap& attrs,
+                      const platform::NPUDeviceContext& dev_ctx) {
+      const auto& runner = NpuOpRunner("ExpandD", inputs, outputs, attrs);
+      runner.Run(dev_ctx.stream());
+    };
+
+    if (X->type() == framework::proto::VarType::BOOL) {
+      NpuOpRunner::TypeAdapter({*X}, {*Out}, attr_input, dev_ctx, op_func,
+                               {framework::proto::VarType::UINT8},
+                               {framework::proto::VarType::UINT8});
+    } else if (X->type() == framework::proto::VarType::INT64) {
+      NpuOpRunner::TypeAdapter({*X}, {*Out}, attr_input, dev_ctx, op_func,
+                               {framework::proto::VarType::INT32},
+                               {framework::proto::VarType::INT32});
+    } else {
+      const auto& runner = NpuOpRunner("ExpandD", {*X}, {*Out}, attr_input);
+      runner.Run(dev_ctx.stream());
+    }
   }
 };
 
@@ -181,7 +198,9 @@ REGISTER_OP_NPU_KERNEL(
     ops::ExpandV2NPUKernel<paddle::platform::NPUDeviceContext, float>,
     ops::ExpandV2NPUKernel<paddle::platform::NPUDeviceContext,
                            paddle::platform::float16>,
-    ops::ExpandV2NPUKernel<paddle::platform::NPUDeviceContext, int>);
+    ops::ExpandV2NPUKernel<paddle::platform::NPUDeviceContext, int64_t>,
+    ops::ExpandV2NPUKernel<paddle::platform::NPUDeviceContext, int>,
+    ops::ExpandV2NPUKernel<paddle::platform::NPUDeviceContext, bool>);
 
 REGISTER_OP_NPU_KERNEL(
     expand_v2_grad,
diff --git a/paddle/fluid/operators/fake_quantize_op.cu b/paddle/fluid/operators/fake_quantize_op.cu
index 583ff157a0d39..8f2235c7e3d21 100644
--- a/paddle/fluid/operators/fake_quantize_op.cu
+++ b/paddle/fluid/operators/fake_quantize_op.cu
@@ -216,14 +216,14 @@ __global__ void ClipAndQuantDequantKernel(const T* in, const T* scale,
   int tid = threadIdx.x;
 
   T s = scale[0];
+  T inv_s = inverse(s);
   T bin_cnt_t = static_cast<T>(bin_cnt);
 
   for (int i = bid; i < n; i += blockDim.x * gridDim.x) {
     T x = in[i];
     x = x > s ? s : x;
     x = x < -s ? -s : x;
-    x = (bin_cnt_t / s) * x;
-
+    x = bin_cnt_t * inv_s * x;
     x = static_cast<T>(round(static_cast<float>(x)));
     out[i] = (x * s) / bin_cnt_t;
   }
diff --git a/paddle/fluid/operators/fake_quantize_op.h b/paddle/fluid/operators/fake_quantize_op.h
index 11a2d2de8bcf7..21e7079ff6233 100644
--- a/paddle/fluid/operators/fake_quantize_op.h
+++ b/paddle/fluid/operators/fake_quantize_op.h
@@ -28,8 +28,9 @@ namespace operators {
 
 template <typename T>
 inline HOSTDEVICE T inverse(T s) {
-  T eps = 1e-6;
-  return s <= 1e-30 ? 1.0 / (s + eps) : 1.0 / s;
+  T eps = static_cast<T>(1e-6);
+  T one = static_cast<T>(1.0);
+  return s <= static_cast<T>(1e-30) ? one / (s + eps) : one / s;
 }
 
 template <typename DeviceContext, typename T>
diff --git a/paddle/fluid/operators/fill_any_like_op.cc b/paddle/fluid/operators/fill_any_like_op.cc
index 1e908d5ead9c6..3174fada77802 100644
--- a/paddle/fluid/operators/fill_any_like_op.cc
+++ b/paddle/fluid/operators/fill_any_like_op.cc
@@ -47,6 +47,12 @@ class FillAnyLikeOp : public framework::OperatorWithKernel {
                                    expected_kernel_type.place_,
                                    tensor.layout());
   }
+
+  framework::KernelSignature GetExpectedPtenKernelArgs(
+      const framework::ExecutionContext &ctx) const override {
+    return framework::KernelSignature("fill_any_like", {"X"}, {"value"},
+                                      {"Out"});
+  }
 };
 
 class FillAnyLikeOpMaker : public framework::OpProtoAndCheckerMaker {
diff --git a/paddle/fluid/operators/fill_any_like_op.h b/paddle/fluid/operators/fill_any_like_op.h
index 2fb7bf985f222..5e2d06672c3c1 100644
--- a/paddle/fluid/operators/fill_any_like_op.h
+++ b/paddle/fluid/operators/fill_any_like_op.h
@@ -17,7 +17,10 @@ limitations under the License. */
 #include <limits>
 #include <type_traits>
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/math/math_function.h"
+#include "paddle/fluid/framework/pten_utils.h"
+
+#include "paddle/pten/include/core.h"
+#include "paddle/pten/include/creation.h"
 
 namespace paddle {
 namespace operators {
@@ -31,6 +34,7 @@ class FillAnyLikeKernel : public framework::OpKernel<T> {
                                 float, T>::type>::type;
 
   void Compute(const framework::ExecutionContext& context) const override {
+    auto* in = context.Input<framework::Tensor>("X");
     auto* out = context.Output<framework::Tensor>("Out");
     out->mutable_data<T>(context.GetPlace());
 
@@ -58,9 +62,12 @@ class FillAnyLikeKernel : public framework::OpKernel<T> {
         std::isnan(value), false,
         platform::errors::InvalidArgument("The filled value is NaN."));
 
-    math::SetConstant<DeviceContext, T> setter;
-    setter(context.template device_context<DeviceContext>(), out,
-           static_cast<T>(value));
+    auto pt_x = paddle::experimental::MakePtenDenseTensor(*in);
+    auto pt_out = paddle::experimental::MakePtenDenseTensor(*out);
+
+    const auto& dev_ctx = context.template device_context<DeviceContext>();
+    // call new kernel
+    pten::FillAnyLike<T>(dev_ctx, *pt_x, value, pt_out.get());
   }
 };
 
diff --git a/paddle/fluid/operators/fill_constant_op.cc b/paddle/fluid/operators/fill_constant_op.cc
index 44dcc343a4b4a..aea149fbedc45 100644
--- a/paddle/fluid/operators/fill_constant_op.cc
+++ b/paddle/fluid/operators/fill_constant_op.cc
@@ -64,9 +64,51 @@ class FillConstantOp : public framework::OperatorWithKernel {
 
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(
+    framework::OpKernelType kt = framework::OpKernelType(
         framework::proto::VarType::Type(ctx.Attr<int>("dtype")),
         ctx.GetPlace());
+    // TODO(zyfncg) The force_cpu and place_type are conflicted, it's a issue
+    // lefted before, and we may merge them in the future.
+    // In order to invoke new fill_constant kernel, the place of OpKernelType
+    // will be setted by force_cpu and place_type here.
+    if (ctx.Attr<bool>("force_cpu")) {
+      kt.place_ = platform::CPUPlace();
+    }
+    auto place_type = ctx.Attr<int>("place_type");
+    if (place_type != -1) {
+      switch (place_type) {
+        case 0:
+          kt.place_ = platform::CPUPlace();
+          break;
+        case 1:
+        case 2:
+          kt.place_ = platform::CUDAPlace();
+          break;
+        case 3:
+          kt.place_ = platform::XPUPlace();
+          break;
+        default:
+          PADDLE_THROW(platform::errors::Unimplemented(
+              "Could NOT determine the place of variable, place_type = %d .",
+              place_type));
+      }
+    }
+
+    return kt;
+  }
+
+  framework::KernelSignature GetExpectedPtenKernelArgs(
+      const framework::ExecutionContext& ctx) const override {
+    if (!ctx.HasInput("ShapeTensor") &&
+        ctx.MultiInput<framework::Tensor>("ShapeTensorList").empty() &&
+        !ctx.HasInput("ValueTensor") &&
+        !ctx.OutputVar("Out")->IsType<framework::SelectedRows>()) {
+      const auto& str_value = ctx.Attr<std::string>("str_value");
+      std::string value = str_value.empty() ? "value" : "str_value";
+      return framework::KernelSignature("fill_constant.scalar", {}, {value},
+                                        {"Out"});
+    }
+    return framework::KernelSignature("fill_constant.unregistered", {}, {}, {});
   }
 };
 
diff --git a/paddle/fluid/operators/fill_constant_op_npu.cc b/paddle/fluid/operators/fill_constant_op_npu.cc
index 16a2433f5cad6..7241fcaf1878f 100644
--- a/paddle/fluid/operators/fill_constant_op_npu.cc
+++ b/paddle/fluid/operators/fill_constant_op_npu.cc
@@ -22,13 +22,13 @@ namespace operators {
 template <typename T>
 class FillConstantNPUKernel : public framework::OpKernel<T> {
  public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
+  void Compute(const framework::ExecutionContext &ctx) const override {
     auto data_type =
         static_cast<framework::proto::VarType::Type>(ctx.Attr<int>("dtype"));
     auto str_value = ctx.Attr<std::string>("str_value");
     auto float_value = ctx.Attr<float>("value");
 
-    auto* out_var = ctx.Output<framework::Tensor>("Out");
+    auto *out_var = ctx.Output<framework::Tensor>("Out");
     auto stream =
         ctx.template device_context<paddle::platform::NPUDeviceContext>()
             .stream();
@@ -59,28 +59,49 @@ class FillConstantNPUKernel : public framework::OpKernel<T> {
     }
     auto shape = GetShape(ctx);
 
-    Tensor tensor_value(data_type);
-    tensor_value.mutable_data<T>({1}, ctx.GetPlace());
-    FillNpuTensorWithConstant<T>(&tensor_value, value);
-
     out_var->mutable_data<T>(shape, ctx.GetPlace());
-
-    NpuOpRunner runner;
+    if (data_type != framework::proto::VarType::BOOL) {
+      Tensor tensor_value(data_type);
+      tensor_value.mutable_data<T>({1}, ctx.GetPlace());
+      FillNpuTensorWithConstant<T>(&tensor_value, value);
+      NpuOpRunner runner;
 #if (CANN_VERSION_CODE >= 503003)
-    runner.SetType("FillD")
-        .AddInput(tensor_value)
-        .AddOutput(*out_var)
-        .AddAttrs(
-            {{ "dims",
-               framework::vectorize(shape) }})
-        .Run(stream);
+      runner.SetType("FillD")
+          .AddInput(tensor_value)
+          .AddOutput(*out_var)
+          .AddAttrs(
+              {{ "dims",
+                 framework::vectorize(shape) }})
+          .Run(stream);
 #else
-    runner.SetType("Fill")
-        .AddInput(framework::vectorize(shape))
-        .AddInput(tensor_value)
-        .AddOutput(*out_var)
-        .Run(stream);
+      runner.SetType("Fill")
+          .AddInput(framework::vectorize(shape))
+          .AddInput(tensor_value)
+          .AddOutput(*out_var)
+          .Run(stream);
 #endif
+    } else {
+      const auto &dev_ctx =
+          ctx.template device_context<paddle::platform::NPUDeviceContext>();
+      auto op_func = [&shape, &value](
+          const std::vector<Tensor> &inputs, const std::vector<Tensor> &outputs,
+          const NPUAttributeMap &attrs,
+          const platform::NPUDeviceContext &dev_ctx) {
+        Tensor tensor_value;
+        tensor_value.mutable_data<uint8_t>({1}, dev_ctx.GetPlace());
+        FillNpuTensorWithConstant<uint8_t>(&tensor_value,
+                                           static_cast<uint8_t>(value));
+
+        NpuOpRunner runner;
+        runner.SetType("Fill")
+            .AddInput(framework::vectorize(shape))
+            .AddInput(tensor_value)
+            .AddOutput(outputs[0])
+            .Run(dev_ctx.stream());
+      };
+      NpuOpRunner::TypeAdapter({}, {*out_var}, {}, dev_ctx, op_func, {},
+                               {framework::proto::VarType::UINT8});
+    }
   }
 };
 }  // namespace operators
diff --git a/paddle/fluid/operators/fill_constant_op_xpu.cc b/paddle/fluid/operators/fill_constant_op_xpu.cc
index d55b8e2b81b52..a70f9e2c3b337 100644
--- a/paddle/fluid/operators/fill_constant_op_xpu.cc
+++ b/paddle/fluid/operators/fill_constant_op_xpu.cc
@@ -17,8 +17,11 @@ namespace ops = paddle::operators;
 #ifdef PADDLE_WITH_XPU
 REGISTER_OP_XPU_KERNEL(
     fill_constant, ops::FillConstantKernel<float>,
-    ops::FillConstantKernel<int64_t>, ops::FillConstantKernel<double>,
-    ops::FillConstantKernel<bool>, ops::FillConstantKernel<int>,
+    ops::FillConstantKernel<double>, ops::FillConstantKernel<uint8_t>,
+    ops::FillConstantKernel<int16_t>, ops::FillConstantKernel<int>,
+    ops::FillConstantKernel<int64_t>, ops::FillConstantKernel<bool>,
+    ops::FillConstantKernel<paddle::platform::float16>,
+    ops::FillConstantKernel<paddle::platform::bfloat16>,
     ops::FillConstantKernel<paddle::platform::complex<float>>,
     ops::FillConstantKernel<paddle::platform::complex<double>>);
 #endif
diff --git a/paddle/fluid/operators/flatten_op.cc b/paddle/fluid/operators/flatten_op.cc
index 14f2e9061b742..517422af1f6aa 100644
--- a/paddle/fluid/operators/flatten_op.cc
+++ b/paddle/fluid/operators/flatten_op.cc
@@ -79,14 +79,6 @@ class FlattenOp : public framework::OperatorWithKernel {
       const framework::ExecutionContext &ctx) const override {
     auto input_data_type =
         framework::OperatorWithKernel::IndicateVarDataType(ctx, "X");
-
-    //#ifdef PADDLE_WITH_MKLDNN
-    //    if (this->CanMKLDNNBeUsed(ctx, input_data_type)) {
-    //      return framework::OpKernelType(input_data_type, ctx.GetPlace(),
-    //                                     framework::DataLayout::kMKLDNN,
-    //                                     framework::LibraryType::kMKLDNN);
-    //    }
-    //#endif
     return framework::OpKernelType(input_data_type, ctx.GetPlace());
   }
 };
@@ -157,14 +149,6 @@ class FlattenGradOp : public framework::OperatorWithKernel {
       const framework::ExecutionContext &ctx) const override {
     auto input_data_type = framework::OperatorWithKernel::IndicateVarDataType(
         ctx, framework::GradVarName("Out"));
-
-    //#ifdef PADDLE_WITH_MKLDNN
-    //    if (this->CanMKLDNNBeUsed(ctx, input_data_type)) {
-    //      return framework::OpKernelType(input_data_type, ctx.GetPlace(),
-    //                                     framework::DataLayout::kMKLDNN,
-    //                                     framework::LibraryType::kMKLDNN);
-    //    }
-    //#endif
     return framework::OpKernelType(input_data_type, ctx.GetPlace());
   }
 };
@@ -227,14 +211,6 @@ class Flatten2Op : public framework::OperatorWithKernel {
       const framework::ExecutionContext &ctx) const override {
     auto input_data_type =
         framework::OperatorWithKernel::IndicateVarDataType(ctx, "X");
-
-    //#ifdef PADDLE_WITH_MKLDNN
-    //    if (this->CanMKLDNNBeUsed(ctx, input_data_type)) {
-    //      return framework::OpKernelType(input_data_type, ctx.GetPlace(),
-    //                                     framework::DataLayout::kMKLDNN,
-    //                                     framework::LibraryType::kMKLDNN);
-    //    }
-    //#endif
     return framework::OpKernelType(input_data_type, ctx.GetPlace());
   }
 };
@@ -285,14 +261,6 @@ class Flatten2GradOp : public framework::OperatorWithKernel {
       const framework::ExecutionContext &ctx) const override {
     auto input_data_type = framework::OperatorWithKernel::IndicateVarDataType(
         ctx, framework::GradVarName("Out"));
-
-    //#ifdef PADDLE_WITH_MKLDNN
-    //    if (this->CanMKLDNNBeUsed(ctx, input_data_type)) {
-    //      return framework::OpKernelType(input_data_type, ctx.GetPlace(),
-    //                                     framework::DataLayout::kMKLDNN,
-    //                                     framework::LibraryType::kMKLDNN);
-    //    }
-    //#endif
     return framework::OpKernelType(input_data_type, ctx.GetPlace());
   }
 };
@@ -365,6 +333,18 @@ class FlattenContiguousRangeOp : public framework::OperatorWithKernel {
 
     return out_shape;
   }
+
+  framework::KernelSignature GetExpectedPtenKernelArgs(
+      const framework::ExecutionContext &ctx) const override {
+    if (ctx.HasOutput("XShape")) {
+      return framework::KernelSignature("flatten_contiguous_range.mid", {"X"},
+                                        {"start_axis", "stop_axis"},
+                                        {"Out", "XShape"});
+    } else {
+      return framework::KernelSignature("flatten_contiguous_range", {"X"},
+                                        {"start_axis", "stop_axis"}, {"Out"});
+    }
+  }
 };
 
 class FlattenContiguousRangeOpMaker : public FlattenOpMaker {
diff --git a/paddle/fluid/operators/flatten_op.h b/paddle/fluid/operators/flatten_op.h
index efcb0cbe2e2a8..7d08a95821138 100644
--- a/paddle/fluid/operators/flatten_op.h
+++ b/paddle/fluid/operators/flatten_op.h
@@ -15,10 +15,13 @@ limitations under the License. */
 #pragma once
 #include <vector>
 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/pten_utils.h"
 #include "paddle/fluid/operators/math/blas.h"
 #include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/operators/math/pooling.h"
 #include "paddle/fluid/platform/device_context.h"
+#include "paddle/pten/include/core.h"
+#include "paddle/pten/include/manipulation.h"
 
 namespace paddle {
 namespace operators {
@@ -122,13 +125,16 @@ class FlattenContiguousRangeKernel : public framework::OpKernel<T> {
   void Compute(const framework::ExecutionContext &context) const override {
     auto *in = context.Input<framework::LoDTensor>("X");
     auto *out = context.Output<framework::LoDTensor>("Out");
-    auto out_dims = out->dims();
-
     out->mutable_data(context.GetPlace(), in->type());
-    framework::TensorCopy(
-        *in, context.GetPlace(),
-        context.template device_context<platform::DeviceContext>(), out);
-    out->Resize(out_dims);
+    auto &start_axis = context.Attr<int>("start_axis");
+    auto &stop_axis = context.Attr<int>("stop_axis");
+    auto &dev_ctx = context.device_context<DeviceContext>();
+
+    auto pt_x = paddle::experimental::MakePtenDenseTensor(*in);
+    auto pt_out = paddle::experimental::MakePtenDenseTensor(*out);
+
+    // call new kernel
+    pten::Flatten<T>(dev_ctx, *pt_x.get(), start_axis, stop_axis, pt_out.get());
   }
 };
 
diff --git a/paddle/fluid/operators/fused/fmha_ref.h b/paddle/fluid/operators/fused/fmha_ref.h
index bef0052a00d6b..066e7e15e8831 100644
--- a/paddle/fluid/operators/fused/fmha_ref.h
+++ b/paddle/fluid/operators/fused/fmha_ref.h
@@ -69,7 +69,7 @@ class FMHARef {
   ~FMHARef() {}
 
   void ComputeForward(const Tensor& qkv_input_tensor,
-                      const Tensor& src_mask_tensor,
+                      const Tensor* src_mask_tensor,
                       Tensor* transpose_2_out_tensor, Tensor* qk_out_tensor,
                       Tensor* src_mask_out_tensor, Tensor* softmax_out_tensor,
                       Tensor* dropout_mask_out_tensor,
@@ -111,17 +111,17 @@ class FMHARef {
     blas.BatchedGEMM(transA, transB, gemm_m, gemm_n, gemm_k, alpha, q_ptr,
                      k_ptr, beta, qk_out_data, gemm_batch_size, stride_a,
                      stride_b);
-
-    std::vector<const Tensor*> ins;
-    std::vector<Tensor*> outs;
-    ins.emplace_back(qk_out_tensor);
-    ins.emplace_back(&src_mask_tensor);
-    outs.emplace_back(src_mask_out_tensor);
-    int elewise_add_axis = -1;
     int softmax_axis = -1;
-    if (&src_mask_tensor != nullptr) {
+    if (src_mask_tensor != nullptr) {
+      std::vector<const Tensor*> ins;
+      std::vector<Tensor*> outs;
+      ins.emplace_back(qk_out_tensor);
+      ins.emplace_back(src_mask_tensor);
+      outs.emplace_back(src_mask_out_tensor);
+      int elewise_add_axis = -1;
       LaunchElementwiseCudaKernel<ElementwiseType::kBinary, T, T>(
           dev_ctx_, ins, &outs, elewise_add_axis, AddFunctor<T>());
+
       SoftmaxForwardCUDAKernelDriver<T>(dev_ctx_, *src_mask_out_tensor,
                                         softmax_axis, softmax_out_tensor);
     } else {
@@ -165,7 +165,7 @@ class FMHARef {
   }
 
   void ComputeBackward(
-      const Tensor& transpose_2_out_tensor, const Tensor& src_mask_tensor,
+      const Tensor& transpose_2_out_tensor, const Tensor* src_mask_tensor,
       const Tensor& softmax_out_tensor, const Tensor& dropout_mask_out_tensor,
       const Tensor& dropout_out_tensor, const Tensor& qk_out_tensor,
       const Tensor& src_mask_out_tensor, const Tensor& fmha_out_grad_tensor,
@@ -249,7 +249,7 @@ class FMHARef {
           softmax_out_grad_tensor);
     }
 
-    if (&src_mask_tensor != nullptr) {
+    if (src_mask_tensor != nullptr) {
       SoftmaxBackwardCUDAKernelDriver<T>(dev_ctx_, softmax_out_tensor,
                                          *softmax_out_grad_tensor, softmax_axis,
                                          src_mask_out_grad_tensor);
diff --git a/paddle/fluid/operators/fused/fused_attention_op.cc b/paddle/fluid/operators/fused/fused_attention_op.cc
index 6c4ac318264e8..11601a5ce40d5 100644
--- a/paddle/fluid/operators/fused/fused_attention_op.cc
+++ b/paddle/fluid/operators/fused/fused_attention_op.cc
@@ -27,8 +27,6 @@ class FusedAttentionOp : public framework::OperatorWithKernel {
 
   void InferShape(framework::InferShapeContext *ctx) const override {
     OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "FusedAttentionOp");
-    OP_INOUT_CHECK(ctx->HasInput("SrcMask"), "Input", "SrcMask",
-                   "FusedAttentionOp");
     OP_INOUT_CHECK(ctx->HasInput("QKVW"), "Input", "QKVW", "FusedAttentionOp");
     OP_INOUT_CHECK(ctx->HasInput("QKVBias"), "Input", "QKVBias",
                    "FusedAttentionOp");
@@ -37,12 +35,22 @@ class FusedAttentionOp : public framework::OperatorWithKernel {
     OP_INOUT_CHECK(ctx->HasInput("OutLinearBias"), "Input", "OutLinearBias",
                    "FusedAttentionOp");
 
-    OP_INOUT_CHECK(ctx->HasOutput("LnMean"), "Output", "LnMean",
-                   "FusedAttentionOp");
-    OP_INOUT_CHECK(ctx->HasOutput("LnVariance"), "Output", "LnVariance",
-                   "FusedAttentionOp");
-    OP_INOUT_CHECK(ctx->HasOutput("LnOut"), "Output", "LnOut",
-                   "FusedAttentionOp");
+    if (ctx->Attrs().Get<bool>("pre_layer_norm") == true) {
+      OP_INOUT_CHECK(ctx->HasOutput("LnMean"), "Output", "LnMean",
+                     "FusedAttentionOp");
+      OP_INOUT_CHECK(ctx->HasOutput("LnVariance"), "Output", "LnVariance",
+                     "FusedAttentionOp");
+      OP_INOUT_CHECK(ctx->HasOutput("LnOut"), "Output", "LnOut",
+                     "FusedAttentionOp");
+    } else {
+      OP_INOUT_CHECK(ctx->HasOutput("Ln2Mean"), "Output", "Ln2Mean",
+                     "FusedAttentionOp");
+      OP_INOUT_CHECK(ctx->HasOutput("Ln2Variance"), "Output", "Ln2Variance",
+                     "FusedAttentionOp");
+      OP_INOUT_CHECK(ctx->HasOutput("BiasDropoutResidualOut"), "Output",
+                     "BiasDropoutResidualOut", "FusedAttentionOp");
+    }
+
     // qkv_out: [batch_size, seq_len, 3, num_head, dim_head]
     OP_INOUT_CHECK(ctx->HasOutput("QKVOut"), "Output", "QKVOut",
                    "FusedAttentionOp");
@@ -54,8 +62,11 @@ class FusedAttentionOp : public framework::OperatorWithKernel {
                    "FusedAttentionOp");
     OP_INOUT_CHECK(ctx->HasOutput("QKTVOut"), "Output", "QKTVOut",
                    "FusedAttentionOp");
-    OP_INOUT_CHECK(ctx->HasOutput("SrcMaskOut"), "Output", "SrcMaskOut",
-                   "FusedAttentionOp");
+
+    if (ctx->HasInput("SrcMask")) {
+      OP_INOUT_CHECK(ctx->HasOutput("SrcMaskOut"), "Output", "SrcMaskOut",
+                     "FusedAttentionOp");
+    }
     OP_INOUT_CHECK(ctx->HasOutput("SoftmaxOut"), "Output", "SoftmaxOut",
                    "FusedAttentionOp");
     OP_INOUT_CHECK(ctx->HasOutput("AttnDropoutMaskOut"), "Output",
@@ -66,12 +77,7 @@ class FusedAttentionOp : public framework::OperatorWithKernel {
                    "FusedAttentionOp");
     OP_INOUT_CHECK(ctx->HasOutput("OutLinearOut"), "Output", "OutLinearOut",
                    "FusedAttentionOp");
-    OP_INOUT_CHECK(ctx->HasOutput("Ln2Mean"), "Output", "Ln2Mean",
-                   "FusedAttentionOp");
-    OP_INOUT_CHECK(ctx->HasOutput("Ln2Variance"), "Output", "Ln2Variance",
-                   "FusedAttentionOp");
-    OP_INOUT_CHECK(ctx->HasOutput("BiasDropoutResidualOut"), "Output",
-                   "BiasDropoutResidualOut", "FusedAttentionOp");
+
     OP_INOUT_CHECK(ctx->HasOutput("DropoutMaskOut"), "Output", "DropoutMaskOut",
                    "FusedAttentionOp");
     OP_INOUT_CHECK(ctx->HasOutput("Y"), "Output", "Y", "FusedAttentionOp");
@@ -101,9 +107,15 @@ class FusedAttentionOp : public framework::OperatorWithKernel {
                           "input qkv_weight = [%s]",
                           x_dim, y_dim));
 
-    ctx->SetOutputDim("LnMean", {x_dim[0] * x_dim[1]});
-    ctx->SetOutputDim("LnVariance", {x_dim[0] * x_dim[1]});
-    ctx->SetOutputDim("LnOut", ctx->GetInputDim("X"));
+    if (ctx->Attrs().Get<bool>("pre_layer_norm") == true) {
+      ctx->SetOutputDim("LnMean", {x_dim[0] * x_dim[1]});
+      ctx->SetOutputDim("LnVariance", {x_dim[0] * x_dim[1]});
+      ctx->SetOutputDim("LnOut", ctx->GetInputDim("X"));
+    } else {
+      ctx->SetOutputDim("Ln2Mean", {x_dim[0] * x_dim[1]});
+      ctx->SetOutputDim("Ln2Variance", {x_dim[0] * x_dim[1]});
+      ctx->SetOutputDim("BiasDropoutResidualOut", ctx->GetInputDim("X"));
+    }
     // [batch_size, seq_len, 3, num_head, head_size]
     ctx->SetOutputDim("QKVOut",
                       {x_dim[0], x_dim[1], y_dim[0], y_dim[1], y_dim[2]});
@@ -114,7 +126,10 @@ class FusedAttentionOp : public framework::OperatorWithKernel {
                       {y_dim[0], x_dim[0], y_dim[1], x_dim[1], y_dim[2]});
     // [batch, num_head, seq_len, seq_len]
     ctx->SetOutputDim("QKOut", {x_dim[0], y_dim[1], x_dim[1], x_dim[1]});
-    ctx->SetOutputDim("SrcMaskOut", {x_dim[0], y_dim[1], x_dim[1], x_dim[1]});
+
+    if (ctx->HasInput("SrcMask")) {
+      ctx->SetOutputDim("SrcMaskOut", {x_dim[0], y_dim[1], x_dim[1], x_dim[1]});
+    }
     // the same as QKOut's shape.
     ctx->SetOutputDim("AttnDropoutOut",
                       {x_dim[0], y_dim[1], x_dim[1], x_dim[1]});
@@ -129,12 +144,10 @@ class FusedAttentionOp : public framework::OperatorWithKernel {
     ctx->SetOutputDim("FMHAOut", {x_dim[0], x_dim[1], y_dim[1], y_dim[2]});
     ctx->SetOutputDim("OutLinearOut", ctx->GetInputDim("X"));
 
-    ctx->SetOutputDim("Ln2Mean", {x_dim[0] * x_dim[1]});
-    ctx->SetOutputDim("Ln2Variance", {x_dim[0] * x_dim[1]});
     if (ctx->Attrs().Get<bool>("dropout_is_test") == false) {
       ctx->SetOutputDim("DropoutMaskOut", ctx->GetInputDim("X"));
     }
-    ctx->SetOutputDim("BiasDropoutResidualOut", ctx->GetInputDim("X"));
+
     ctx->SetOutputDim("Y", ctx->GetInputDim("X"));
   }
 
@@ -305,25 +318,28 @@ class FusedAttentionOpMaker : public framework::OpProtoAndCheckerMaker {
         });
 
     AddComment(R"DOC(
-    	Add fused attention op whose logic is as follows:
-        // @input: [batch_size, seq_len, 3, num_head, head_dim] 
-        // @final_out: [batch_size, seq_len, num_heads, head_dim] 
-   	if (pre_layernorm)
-    	    out = layer_norm(input);
+  Add fused attention op whose logic is as follows:
+  // @input: [batch_size, seq_len, 3, num_head, head_dim] 
+  // @final_out: [batch_size, seq_len, num_heads, head_dim] 
+  if (pre_layernorm)
+    out = layer_norm(input);
 	out = compute_qkv(out) + bias;
 	// fmha module
-	{
-            out = transpose(out, perm=[2, 0, 3, 1, 4]);
-            out = q * k^t;
-            out = attn_mark + out;
-            out = softmax(out);
-            out = dropout(out);
-            out = out * v;
-            out = transpose(out, perm=[0, 2, 1, 3]);
+  {
+    out = transpose(out, perm=[2, 0, 3, 1, 4]);
+    out = q * k^t;
+    out = attn_mask + out;
+    out = softmax(out);
+    out = dropout(out);
+    out = out * v;
+    out = transpose(out, perm=[0, 2, 1, 3]);
                 
-        }
+  }
 	out = out_linear(out);
-	final_out = layer_norm(residual + dropout(bias + out));
+  if (pre_layernorm)
+    final_out = residual + dropout(bias + out);
+  else
+    final_out = layer_norm(residual + dropout(bias + out));
     )DOC");
   }
 };
@@ -338,45 +354,47 @@ class FusedAttentionGradOp : public framework::OperatorWithKernel {
         platform::errors::InvalidArgument(
             "GradOp is only callable when attn_dropout_is_test is false"));
 
-    OP_INOUT_CHECK(ctx->HasInput("Ln2Mean"), "Input", "Ln2Mean",
-                   "FusedAttentionGrad");
-    OP_INOUT_CHECK(ctx->HasInput("Ln2Variance"), "Input", "Ln2Variance",
-                   "FusedAttentionGrad");
-    if (ctx->HasOutput(framework::GradVarName("Ln2Scale"))) {
-      ctx->SetOutputDim(framework::GradVarName("Ln2Scale"),
-                        ctx->GetInputDim("Ln2Scale"));
-    }
-    if (ctx->HasOutput(framework::GradVarName("Ln2Bias"))) {
-      ctx->SetOutputDim(framework::GradVarName("Ln2Bias"),
-                        ctx->GetInputDim("Ln2Bias"));
-    }
-    OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "FusedAttentionGrad");
-    OP_INOUT_CHECK(ctx->HasInput("LnMean"), "Input", "LnMean",
-                   "FusedAttentionGrad");
-    OP_INOUT_CHECK(ctx->HasInput("LnVariance"), "Input", "LnVariance",
-                   "FusedAttentionGrad");
-    if (ctx->Attrs().Get<bool>("pre_layer_norm") == true) {
+    if (ctx->Attrs().Get<bool>("pre_layer_norm") == false) {
+      OP_INOUT_CHECK(ctx->HasInput("Ln2Mean"), "Input", "Ln2Mean",
+                     "FusedAttentionGrad");
+      OP_INOUT_CHECK(ctx->HasInput("Ln2Variance"), "Input", "Ln2Variance",
+                     "FusedAttentionGrad");
+      if (ctx->HasOutput(framework::GradVarName("Ln2Scale"))) {
+        ctx->SetOutputDim(framework::GradVarName("Ln2Scale"),
+                          ctx->GetInputDim("Ln2Scale"));
+      }
+      if (ctx->HasOutput(framework::GradVarName("Ln2Bias"))) {
+        ctx->SetOutputDim(framework::GradVarName("Ln2Bias"),
+                          ctx->GetInputDim("Ln2Bias"));
+      }
+    } else {
+      OP_INOUT_CHECK(ctx->HasInput("LnMean"), "Input", "LnMean",
+                     "FusedAttentionGrad");
+      OP_INOUT_CHECK(ctx->HasInput("LnVariance"), "Input", "LnVariance",
+                     "FusedAttentionGrad");
       OP_INOUT_CHECK(ctx->HasInput("LnOut"), "Input", "LnOut",
                      "FusedAttentionGrad");
     }
+
+    OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "FusedAttentionGrad");
     OP_INOUT_CHECK(ctx->HasInput("QKVW"), "Input", "QKVW",
                    "FusedAttentionGrad");
     OP_INOUT_CHECK(ctx->HasInput("QKVBias"), "Input", "QKVBias",
                    "FusedAttentionGrad");
-    OP_INOUT_CHECK(ctx->HasInput("SrcMask"), "Input", "SrcMask",
-                   "FusedAttentionGrad");
     OP_INOUT_CHECK(ctx->HasInput("OutLinearW"), "Input", "OutLinearW",
                    "FusedAttentionGrad");
     OP_INOUT_CHECK(ctx->HasInput("OutLinearBias"), "Input", "OutLinearBias",
                    "FusedAttentionGrad");
 
-    if (ctx->HasOutput(framework::GradVarName("LnScale"))) {
-      ctx->SetOutputDim(framework::GradVarName("LnScale"),
-                        ctx->GetInputDim("LnScale"));
-    }
-    if (ctx->HasOutput(framework::GradVarName("LnBias"))) {
-      ctx->SetOutputDim(framework::GradVarName("LnBias"),
-                        ctx->GetInputDim("LnBias"));
+    if (ctx->Attrs().Get<bool>("pre_layer_norm") == true) {
+      if (ctx->HasOutput(framework::GradVarName("LnScale"))) {
+        ctx->SetOutputDim(framework::GradVarName("LnScale"),
+                          ctx->GetInputDim("LnScale"));
+      }
+      if (ctx->HasOutput(framework::GradVarName("LnBias"))) {
+        ctx->SetOutputDim(framework::GradVarName("LnBias"),
+                          ctx->GetInputDim("LnBias"));
+      }
     }
     if (ctx->HasOutput(framework::GradVarName("X"))) {
       ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
@@ -390,8 +408,13 @@ class FusedAttentionGradOp : public framework::OperatorWithKernel {
     ctx->SetOutputDim(framework::GradVarName("QKVBias"),
                       ctx->GetInputDim("QKVBias"));
 
-    ctx->SetOutputDim(framework::GradVarName("LnOut"),
-                      ctx->GetInputDim("LnOut"));
+    if (ctx->Attrs().Get<bool>("pre_layer_norm") == true) {
+      ctx->SetOutputDim(framework::GradVarName("LnOut"),
+                        ctx->GetInputDim("LnOut"));
+    } else {
+      ctx->SetOutputDim(framework::GradVarName("BiasDropoutResidualOut"),
+                        ctx->GetInputDim("BiasDropoutResidualOut"));
+    }
     ctx->SetOutputDim(framework::GradVarName("FMHAOut"),
                       ctx->GetInputDim("FMHAOut"));
     ctx->SetOutputDim(framework::GradVarName("QKTVOut"),
@@ -404,16 +427,17 @@ class FusedAttentionGradOp : public framework::OperatorWithKernel {
                       ctx->GetInputDim("SoftmaxOut"));
     ctx->SetOutputDim(framework::GradVarName("AttnDropoutOut"),
                       ctx->GetInputDim("AttnDropoutOut"));
-    ctx->SetOutputDim(framework::GradVarName("SrcMaskOut"),
-                      ctx->GetInputDim("SrcMaskOut"));
+
+    if (ctx->HasOutput(framework::GradVarName("SrcMaskOut"))) {
+      ctx->SetOutputDim(framework::GradVarName("SrcMaskOut"),
+                        ctx->GetInputDim("SrcMaskOut"));
+    }
     ctx->SetOutputDim(framework::GradVarName("QKVOut"),
                       ctx->GetInputDim("QKVOut"));
     ctx->SetOutputDim(framework::GradVarName("QKVBiasOut"),
                       ctx->GetInputDim("QKVBiasOut"));
     ctx->SetOutputDim(framework::GradVarName("OutLinearOut"),
                       ctx->GetInputDim("OutLinearOut"));
-    ctx->SetOutputDim(framework::GradVarName("BiasDropoutResidualOut"),
-                      ctx->GetInputDim("BiasDropoutResidualOut"));
   }
 
  protected:
@@ -439,28 +463,42 @@ class FusedAttentionGradOpMaker : public framework::SingleGradOpMaker<T> {
     op->SetInput("X", this->Input("X"));
     op->SetInput("QKVW", this->Input("QKVW"));
     op->SetInput("QKVBias", this->Input("QKVBias"));
-    op->SetInput("SrcMask", this->Input("SrcMask"));
+
+    if (this->HasInput("SrcMask")) {
+      op->SetInput("SrcMask", this->Input("SrcMask"));
+      op->SetInput("SrcMaskOut", this->Output("SrcMaskOut"));
+      op->SetOutput(framework::GradVarName("SrcMaskOut"),
+                    this->OutputGrad("SrcMaskOut"));
+    }
+
     op->SetInput("OutLinearW", this->Input("OutLinearW"));
     op->SetInput("OutLinearBias", this->Input("OutLinearBias"));
-    if (this->HasInput("LnScale")) {
-      op->SetInput("LnScale", this->Input("LnScale"));
-      op->SetOutput(framework::GradVarName("LnScale"),
-                    this->InputGrad("LnScale"));
-    }
-    if (this->HasInput("LnBias")) {
-      op->SetInput("LnBias", this->Input("LnBias"));
-      op->SetOutput(framework::GradVarName("LnBias"),
-                    this->InputGrad("LnBias"));
-    }
-    if (this->HasInput("Ln2Scale")) {
-      op->SetInput("Ln2Scale", this->Input("Ln2Scale"));
-      op->SetOutput(framework::GradVarName("Ln2Scale"),
-                    this->InputGrad("Ln2Scale"));
-    }
-    if (this->HasInput("Ln2Bias")) {
-      op->SetInput("Ln2Bias", this->Input("Ln2Bias"));
-      op->SetOutput(framework::GradVarName("Ln2Bias"),
-                    this->InputGrad("Ln2Bias"));
+
+    op->SetAttrMap(this->Attrs());
+    bool is_pre_layer_norm =
+        BOOST_GET_CONST(bool, op->GetAttr("pre_layer_norm"));
+    if (is_pre_layer_norm) {
+      if (this->HasInput("LnScale")) {
+        op->SetInput("LnScale", this->Input("LnScale"));
+        op->SetOutput(framework::GradVarName("LnScale"),
+                      this->InputGrad("LnScale"));
+      }
+      if (this->HasInput("LnBias")) {
+        op->SetInput("LnBias", this->Input("LnBias"));
+        op->SetOutput(framework::GradVarName("LnBias"),
+                      this->InputGrad("LnBias"));
+      }
+    } else {
+      if (this->HasInput("Ln2Scale")) {
+        op->SetInput("Ln2Scale", this->Input("Ln2Scale"));
+        op->SetOutput(framework::GradVarName("Ln2Scale"),
+                      this->InputGrad("Ln2Scale"));
+      }
+      if (this->HasInput("Ln2Bias")) {
+        op->SetInput("Ln2Bias", this->Input("Ln2Bias"));
+        op->SetOutput(framework::GradVarName("Ln2Bias"),
+                      this->InputGrad("Ln2Bias"));
+      }
     }
 
     op->SetOutput(framework::GradVarName("X"), this->InputGrad("X"));
@@ -473,9 +511,22 @@ class FusedAttentionGradOpMaker : public framework::SingleGradOpMaker<T> {
                   this->InputGrad("OutLinearW"));
 
     // use forward outputs as backward inputs.
-    op->SetInput("LnOut", this->Output("LnOut"));
-    op->SetInput("LnMean", this->Output("LnMean"));
-    op->SetInput("LnVariance", this->Output("LnVariance"));
+    if (is_pre_layer_norm) {
+      if (this->HasOutput("LnOut")) {
+        op->SetInput("LnOut", this->Output("LnOut"));
+      }
+      if (this->HasOutput("LnMean")) {
+        op->SetInput("LnMean", this->Output("LnMean"));
+      }
+      if (this->HasOutput("LnVariance")) {
+        op->SetInput("LnVariance", this->Output("LnVariance"));
+      }
+    } else {
+      op->SetInput("Ln2Mean", this->Output("Ln2Mean"));
+      op->SetInput("Ln2Variance", this->Output("Ln2Variance"));
+      op->SetInput("BiasDropoutResidualOut",
+                   this->Output("BiasDropoutResidualOut"));
+    }
     op->SetInput("QKVOut", this->Output("QKVOut"));
     op->SetInput("QKVBiasOut", this->Output("QKVBiasOut"));
     op->SetInput("TransposeOut2", this->Output("TransposeOut2"));
@@ -484,19 +535,23 @@ class FusedAttentionGradOpMaker : public framework::SingleGradOpMaker<T> {
     op->SetInput("SoftmaxOut", this->Output("SoftmaxOut"));
     op->SetInput("AttnDropoutMaskOut", this->Output("AttnDropoutMaskOut"));
     op->SetInput("AttnDropoutOut", this->Output("AttnDropoutOut"));
-    op->SetInput("SrcMaskOut", this->Output("SrcMaskOut"));
+
     op->SetInput("FMHAOut", this->Output("FMHAOut"));
     op->SetInput("OutLinearOut", this->Output("OutLinearOut"));
-
-    op->SetInput("Ln2Mean", this->Output("Ln2Mean"));
-    op->SetInput("Ln2Variance", this->Output("Ln2Variance"));
     op->SetInput("DropoutMaskOut", this->Output("DropoutMaskOut"));
-    op->SetInput("BiasDropoutResidualOut",
-                 this->Output("BiasDropoutResidualOut"));
     op->SetInput("QKVOut", this->Output("QKVOut"));
 
     // backward outputs: dinput
-    op->SetOutput(framework::GradVarName("LnOut"), this->OutputGrad("LnOut"));
+    if (is_pre_layer_norm) {
+      if (this->HasOutput("LnOut")) {
+        op->SetOutput(framework::GradVarName("LnOut"),
+                      this->OutputGrad("LnOut"));
+      }
+    } else {
+      op->SetOutput(framework::GradVarName("BiasDropoutResidualOut"),
+                    this->OutputGrad("BiasDropoutResidualOut"));
+    }
+
     op->SetOutput(framework::GradVarName("QKVOut"), this->OutputGrad("QKVOut"));
     op->SetOutput(framework::GradVarName("QKVBiasOut"),
                   this->OutputGrad("QKVBiasOut"));
@@ -509,16 +564,11 @@ class FusedAttentionGradOpMaker : public framework::SingleGradOpMaker<T> {
                   this->OutputGrad("SoftmaxOut"));
     op->SetOutput(framework::GradVarName("AttnDropoutOut"),
                   this->OutputGrad("AttnDropoutOut"));
-    op->SetOutput(framework::GradVarName("SrcMaskOut"),
-                  this->OutputGrad("SrcMaskOut"));
+
     op->SetOutput(framework::GradVarName("FMHAOut"),
                   this->OutputGrad("FMHAOut"));
-    op->SetOutput(framework::GradVarName("BiasDropoutResidualOut"),
-                  this->OutputGrad("BiasDropoutResidualOut"));
     op->SetOutput(framework::GradVarName("OutLinearOut"),
                   this->OutputGrad("OutLinearOut"));
-
-    op->SetAttrMap(this->Attrs());
   }
 };
 
diff --git a/paddle/fluid/operators/fused/fused_attention_op.cu b/paddle/fluid/operators/fused/fused_attention_op.cu
index 95e690cb17ec1..76bcb7c9c3a51 100644
--- a/paddle/fluid/operators/fused/fused_attention_op.cu
+++ b/paddle/fluid/operators/fused/fused_attention_op.cu
@@ -95,12 +95,6 @@ class FusedAttentionOpKernel : public framework::OpKernel<T> {
     const auto qkv_w_dims = qkv_weight->dims();
 
     auto *x_data = input_x->data<T>();
-    auto *ln_scale_data = (ln_scale == nullptr ? nullptr : ln_scale->data<U>());
-    auto *ln_bias_data = (ln_bias == nullptr ? nullptr : ln_bias->data<U>());
-    auto *ln_mean_data = ln_mean->mutable_data<U>(ctx.GetPlace());
-    auto *ln_var_data = ln_var->mutable_data<U>(ctx.GetPlace());
-    auto *ln_out_data = ln_out->mutable_data<T>(ctx.GetPlace());
-
     auto *qkv_weight_data = qkv_weight->data<T>();
     auto *qkv_bias_data = qkv_bias->data<T>();
     auto *qkv_out_data = qkv_out->mutable_data<T>(ctx.GetPlace());
@@ -111,7 +105,9 @@ class FusedAttentionOpKernel : public framework::OpKernel<T> {
         transpose_out_2->mutable_data<T>(ctx.GetPlace());
     auto *qk_out_data = qk_out->mutable_data<T>(ctx.GetPlace());
     auto *qktv_out_data = qktv_out->mutable_data<T>(ctx.GetPlace());
-    auto *src_mask_out_data = src_mask_out->mutable_data<T>(ctx.GetPlace());
+    auto *src_mask_out_data =
+        (src_mask == nullptr) ? nullptr
+                              : src_mask_out->mutable_data<T>(ctx.GetPlace());
     auto *softmax_out_data = softmax_out->mutable_data<T>(ctx.GetPlace());
     auto *attn_dropout_mask_out_data =
         attn_dropout_mask_out->mutable_data<uint8_t>(ctx.GetPlace());
@@ -125,16 +121,8 @@ class FusedAttentionOpKernel : public framework::OpKernel<T> {
     auto *out_linear_out_data = out_linear_out->mutable_data<T>(ctx.GetPlace());
 
     // get data ptr for bias+dropout+residual+layernorm
-    auto *ln_scale_2_data =
-        (ln_scale_2 == nullptr ? nullptr : ln_scale_2->data<U>());
-    auto *ln_bias_2_data =
-        (ln_bias_2 == nullptr ? nullptr : ln_bias_2->data<U>());
     auto *dropout_mask_out_data =
         dropout_mask_out->mutable_data<uint8_t>(ctx.GetPlace());
-    auto *bias_dropout_residual_out_data =
-        bias_dropout_residual_out->mutable_data<T>(ctx.GetPlace());
-    auto *ln_mean_2_data = ln_mean_2->mutable_data<U>(ctx.GetPlace());
-    auto *ln_var_2_data = ln_var_2->mutable_data<U>(ctx.GetPlace());
     auto *final_out_data = out->mutable_data<T>(ctx.GetPlace());
 
     int batch_size = input_x_dims[0];
@@ -173,6 +161,13 @@ class FusedAttentionOpKernel : public framework::OpKernel<T> {
         ln_epsilon);
 
     if (pre_layer_norm) {
+      auto *ln_scale_data =
+          (ln_scale == nullptr ? nullptr : ln_scale->data<U>());
+      auto *ln_bias_data = (ln_bias == nullptr ? nullptr : ln_bias->data<U>());
+      auto *ln_mean_data = ln_mean->mutable_data<U>(ctx.GetPlace());
+      auto *ln_var_data = ln_var->mutable_data<U>(ctx.GetPlace());
+      auto *ln_out_data = ln_out->mutable_data<T>(ctx.GetPlace());
+
       layer_norm_compute.ComputeForward(x_data, ln_scale_data, ln_bias_data,
                                         ln_out_data, ln_mean_data, ln_var_data);
       qkv_compute.ComputeForward(qkv_weight_data, ln_out_data, qkv_bias_data,
@@ -181,21 +176,37 @@ class FusedAttentionOpKernel : public framework::OpKernel<T> {
       qkv_compute.ComputeForward(qkv_weight_data, x_data, qkv_bias_data,
                                  qkv_out_data, qkv_bias_out_data);
     }
-    fmha_ref_compute.ComputeForward(*qkv_bias_out, *src_mask, transpose_out_2,
+    fmha_ref_compute.ComputeForward(*qkv_bias_out, src_mask, transpose_out_2,
                                     qk_out, src_mask_out, softmax_out,
                                     attn_dropout_mask_out, attn_dropout_out,
                                     qktv_out, fmha_out);
+
     // fmha_out: [batch_size, seq_len, num_head, head_dim]
     // weight:   [embed_dim, embed_dim]
     // out_linear_out: [batch_size, seq_len, embed_dim]
     out_linear_compute.ComputeForward(out_linear_weight_data, fmha_out_data,
                                       nullptr, out_linear_out_data, nullptr);
-    // output = layernorm(residual + dropout(input + bias))
-    fused_dropout_layernorm_helper.LayernormResidualDropoutBias(
-        ctx.cuda_device_context(), out_linear_out_data, x_data,
-        out_linear_bias_data, ln_scale_2_data, ln_bias_2_data,
-        bias_dropout_residual_out_data, dropout_mask_out_data, final_out_data,
-        ln_mean_2_data, ln_var_2_data);
+    if (pre_layer_norm) {
+      // output = (residual + dropout(input + bias))
+      fused_dropout_layernorm_helper.ResidualDropoutBias(
+          ctx.cuda_device_context(), out_linear_out_data, x_data,
+          out_linear_bias_data, final_out_data, dropout_mask_out_data);
+    } else {
+      auto *ln_scale_2_data =
+          (ln_scale_2 == nullptr ? nullptr : ln_scale_2->data<U>());
+      auto *ln_bias_2_data =
+          (ln_bias_2 == nullptr ? nullptr : ln_bias_2->data<U>());
+      auto *bias_dropout_residual_out_data =
+          bias_dropout_residual_out->mutable_data<T>(ctx.GetPlace());
+      auto *ln_mean_2_data = ln_mean_2->mutable_data<U>(ctx.GetPlace());
+      auto *ln_var_2_data = ln_var_2->mutable_data<U>(ctx.GetPlace());
+      // output = layernorm(residual + dropout(input + bias))
+      fused_dropout_layernorm_helper.LayernormResidualDropoutBias(
+          ctx.cuda_device_context(), out_linear_out_data, x_data,
+          out_linear_bias_data, ln_scale_2_data, ln_bias_2_data,
+          bias_dropout_residual_out_data, dropout_mask_out_data, final_out_data,
+          ln_mean_2_data, ln_var_2_data);
+    }
   }
 };
 
@@ -243,9 +254,6 @@ class FusedAttentionGradKernel : public framework::OpKernel<T> {
     auto *out_linear_bias_data = out_linear_bias->data<T>();
 
     // fw output
-    auto *ln_mean = ctx.Input<Tensor>("LnMean");
-    auto *ln_var = ctx.Input<Tensor>("LnVariance");
-    auto *ln_out = ctx.Input<Tensor>("LnOut");
     auto *fmha_out = ctx.Input<Tensor>("FMHAOut");
     auto *transpose_out_2 = ctx.Input<Tensor>("TransposeOut2");
     auto *qk_out = ctx.Input<Tensor>("QKOut");
@@ -260,24 +268,18 @@ class FusedAttentionGradKernel : public framework::OpKernel<T> {
     auto *dropout_mask_out = ctx.Input<Tensor>("DropoutMaskOut");
     auto *bias_dropout_residual_out =
         ctx.Input<Tensor>("BiasDropoutResidualOut");
-    auto *ln_mean_data = ln_mean->data<U>();
-    auto *ln_var_data = ln_var->data<U>();
-    auto *ln_out_data = ln_out->data<T>();
     auto *fmha_out_data = fmha_out->data<T>();
     auto *transpose_out_2_data = transpose_out_2->data<T>();
     auto *qk_out_data = qk_out->data<T>();
     auto *qktv_out_data = qktv_out->data<T>();
     auto *softmax_out_data = softmax_out->data<T>();
-    auto *src_mask_out_data = src_mask_out->data<T>();
+    auto *src_mask_out_data =
+        (src_mask == nullptr) ? nullptr : src_mask_out->data<T>();
     auto *out_linear_out_data = out_linear_out->data<T>();
-    auto *ln_2_mean_data = ln_2_mean->data<U>();
-    auto *ln_2_var_data = ln_2_var->data<U>();
     auto *dropout_mask_out_data = dropout_mask_out->data<uint8_t>();
-    auto *bias_dropout_residual_out_data = bias_dropout_residual_out->data<T>();
 
     // output's grad
     auto *d_x = ctx.Output<Tensor>(framework::GradVarName("X"));
-    auto *d_ln_out = ctx.Output<Tensor>(framework::GradVarName("LnOut"));
     auto *d_qkv_out = ctx.Output<Tensor>(framework::GradVarName("QKVOut"));
     auto *d_qkv_bias_out =
         ctx.Output<Tensor>(framework::GradVarName("QKVBiasOut"));
@@ -297,7 +299,6 @@ class FusedAttentionGradKernel : public framework::OpKernel<T> {
     auto *d_bias_dropout_residual_out =
         ctx.Output<Tensor>(framework::GradVarName("BiasDropoutResidualOut"));
     auto *d_x_data = d_x->mutable_data<T>(ctx.GetPlace());
-    auto *d_ln_out_data = d_ln_out->mutable_data<T>(ctx.GetPlace());
     auto *d_qkv_out_data = d_qkv_out->mutable_data<T>(ctx.GetPlace());
     auto *d_qkv_bias_out_data = d_qkv_bias_out->mutable_data<T>(ctx.GetPlace());
     auto *d_qktv_out_data = d_qktv_out->mutable_data<T>(ctx.GetPlace());
@@ -307,16 +308,14 @@ class FusedAttentionGradKernel : public framework::OpKernel<T> {
     auto *d_softmax_out_data = d_softmax_out->mutable_data<T>(ctx.GetPlace());
     auto *d_attn_dropout_out_data =
         d_attn_dropout_out->mutable_data<T>(ctx.GetPlace());
-    auto *d_src_mask_out_data = d_src_mask_out->mutable_data<T>(ctx.GetPlace());
+    auto *d_src_mask_out_data =
+        (src_mask == nullptr) ? nullptr
+                              : d_src_mask_out->mutable_data<T>(ctx.GetPlace());
     auto *d_fmha_out_data = d_fmha_out->mutable_data<T>(ctx.GetPlace());
     auto *d_out_linear_out_data =
         d_out_linear_out->mutable_data<T>(ctx.GetPlace());
-    auto *d_bias_dropout_residual_out_data =
-        d_bias_dropout_residual_out->mutable_data<T>(ctx.GetPlace());
 
     // parameter grad
-    auto *d_ln_scale = ctx.Output<Tensor>(framework::GradVarName("LnScale"));
-    auto *d_ln_bias = ctx.Output<Tensor>(framework::GradVarName("LnBias"));
     auto *d_qkv_weight = ctx.Output<Tensor>(framework::GradVarName("QKVW"));
     auto *d_qkv_bias = ctx.Output<Tensor>(framework::GradVarName("QKVBias"));
     auto *d_out_linear_weight =
@@ -325,24 +324,13 @@ class FusedAttentionGradKernel : public framework::OpKernel<T> {
         ctx.Output<Tensor>(framework::GradVarName("OutLinearBias"));
     auto *d_ln_2_scale = ctx.Output<Tensor>(framework::GradVarName("Ln2Scale"));
     auto *d_ln_2_bias = ctx.Output<Tensor>(framework::GradVarName("Ln2Bias"));
-    auto *d_ln_scale_data =
-        (d_ln_scale == nullptr ? nullptr
-                               : d_ln_scale->mutable_data<U>(ctx.GetPlace()));
-    auto *d_ln_bias_data =
-        (d_ln_bias == nullptr ? nullptr
-                              : d_ln_bias->mutable_data<U>(ctx.GetPlace()));
+
     auto *d_qkv_weight_data = d_qkv_weight->mutable_data<T>(ctx.GetPlace());
     auto *d_qkv_bias_data = d_qkv_bias->mutable_data<T>(ctx.GetPlace());
     auto *d_out_linear_weight_data =
         d_out_linear_weight->mutable_data<T>(ctx.GetPlace());
     auto *d_out_linear_bias_data =
         d_out_linear_bias->mutable_data<T>(ctx.GetPlace());
-    auto *d_ln_2_scale_data =
-        (d_ln_2_scale == nullptr ? nullptr : d_ln_2_scale->mutable_data<U>(
-                                                 ctx.GetPlace()));
-    auto *d_ln_2_bias_data =
-        (d_ln_2_bias == nullptr ? nullptr
-                                : d_ln_2_bias->mutable_data<U>(ctx.GetPlace()));
 
     const auto input_x_dims = input_x->dims();
     const auto qkv_w_dims = qkv_weight->dims();
@@ -388,17 +376,36 @@ class FusedAttentionGradKernel : public framework::OpKernel<T> {
         ctx.cuda_device_context(), bsz_seq, dim_embed, dropout_param2,
         ln2epsilon);
 
-    fused_dropout_layernorm_helper.LayernormResidualDropoutBiasGrad(
-        ctx.cuda_device_context(), d_y_data, bias_dropout_residual_out_data,
-        dropout_mask_out_data, ln_2_scale_data, ln_2_mean_data, ln_2_var_data,
-        d_bias_dropout_residual_out_data, d_ln_2_scale_data, d_ln_2_bias_data,
-        d_out_linear_out_data, d_out_linear_bias_data, d_residual_data);
+    if (pre_layer_norm) {
+      fused_dropout_layernorm_helper.ResidualDropoutBiasGrad(
+          ctx.cuda_device_context(), d_y_data, dropout_mask_out_data,
+          d_out_linear_out_data, d_residual_data, d_out_linear_bias_data);
+    } else {
+      auto *ln_2_mean_data = ln_2_mean->data<U>();
+      auto *ln_2_var_data = ln_2_var->data<U>();
+      auto *bias_dropout_residual_out_data =
+          bias_dropout_residual_out->data<T>();
+      auto *d_ln_2_scale_data =
+          (d_ln_2_scale == nullptr ? nullptr : d_ln_2_scale->mutable_data<U>(
+                                                   ctx.GetPlace()));
+      auto *d_ln_2_bias_data =
+          (d_ln_2_bias == nullptr ? nullptr : d_ln_2_bias->mutable_data<U>(
+                                                  ctx.GetPlace()));
+      auto *d_bias_dropout_residual_out_data =
+          d_bias_dropout_residual_out->mutable_data<T>(ctx.GetPlace());
+
+      fused_dropout_layernorm_helper.LayernormResidualDropoutBiasGrad(
+          ctx.cuda_device_context(), d_y_data, bias_dropout_residual_out_data,
+          dropout_mask_out_data, ln_2_scale_data, ln_2_mean_data, ln_2_var_data,
+          d_bias_dropout_residual_out_data, d_ln_2_scale_data, d_ln_2_bias_data,
+          d_out_linear_out_data, d_out_linear_bias_data, d_residual_data);
+    }
 
     out_linear_compute.ComputeBackward(fmha_out_data, out_linear_weight_data,
                                        d_out_linear_out_data, d_fmha_out_data,
                                        d_out_linear_weight_data, nullptr);
     fmha_ref_compute.ComputeBackward(
-        *transpose_out_2, *src_mask, *softmax_out, *attn_dropout_mask_out,
+        *transpose_out_2, src_mask, *softmax_out, *attn_dropout_mask_out,
         *attn_dropout_out, *qk_out, *src_mask_out, *d_fmha_out, d_qktv_out,
         d_attn_dropout_out, d_softmax_out, d_src_mask_out, d_qk_out,
         d_transpose_out_2, nullptr, d_qkv_bias_out);
@@ -407,6 +414,24 @@ class FusedAttentionGradKernel : public framework::OpKernel<T> {
                     cudaMemcpyDeviceToDevice);
 
     if (pre_layer_norm) {
+      auto *ln_mean = ctx.Input<Tensor>("LnMean");
+      auto *ln_var = ctx.Input<Tensor>("LnVariance");
+      auto *ln_out = ctx.Input<Tensor>("LnOut");
+      auto *ln_mean_data = ln_mean->data<U>();
+      auto *ln_var_data = ln_var->data<U>();
+      auto *ln_out_data = ln_out->data<T>();
+
+      auto *d_ln_out = ctx.Output<Tensor>(framework::GradVarName("LnOut"));
+      auto *d_ln_scale = ctx.Output<Tensor>(framework::GradVarName("LnScale"));
+      auto *d_ln_bias = ctx.Output<Tensor>(framework::GradVarName("LnBias"));
+      auto *d_ln_out_data = d_ln_out->mutable_data<T>(ctx.GetPlace());
+      auto *d_ln_scale_data =
+          (d_ln_scale == nullptr ? nullptr
+                                 : d_ln_scale->mutable_data<U>(ctx.GetPlace()));
+      auto *d_ln_bias_data =
+          (d_ln_bias == nullptr ? nullptr
+                                : d_ln_bias->mutable_data<U>(ctx.GetPlace()));
+
       qkv_compute.ComputeBackward(ln_out_data, qkv_weight_data,
                                   d_qkv_bias_out_data, d_ln_out_data,
                                   d_qkv_weight_data, d_qkv_bias_data);
diff --git a/paddle/fluid/operators/fused/fused_feedforward_op.cc b/paddle/fluid/operators/fused/fused_feedforward_op.cc
index 4e03c7369d10e..7da790fc5c6e2 100644
--- a/paddle/fluid/operators/fused/fused_feedforward_op.cc
+++ b/paddle/fluid/operators/fused/fused_feedforward_op.cc
@@ -41,18 +41,8 @@ class FusedFeedForwardOp : public framework::OperatorWithKernel {
                    "fused_feedforward");
     OP_INOUT_CHECK(context->HasOutput("Dropout2Mask"), "Output", "Dropout2Mask",
                    "fused_feedforward");
-    OP_INOUT_CHECK(context->HasOutput("Ln1Mean"), "Output", "Ln1Mean",
-                   "fused_feedforward");
-    OP_INOUT_CHECK(context->HasOutput("Ln1Variance"), "Output", "Ln1Variance",
-                   "fused_feedforward");
-    OP_INOUT_CHECK(context->HasOutput("Ln2Mean"), "Output", "Ln2Mean",
-                   "fused_feedforward");
-    OP_INOUT_CHECK(context->HasOutput("Ln2Variance"), "Output", "Ln2Variance",
-                   "fused_feedforward");
     OP_INOUT_CHECK(context->HasOutput("Linear1Out"), "Output", "Linear1Out",
                    "fused_feedforward");
-    OP_INOUT_CHECK(context->HasOutput("Ln1Out"), "Output", "Ln1Out",
-                   "fused_feedforward");
     OP_INOUT_CHECK(context->HasOutput("Dropout1Out"), "Output", "Dropout1Out",
                    "fused_feedforward");
     OP_INOUT_CHECK(context->HasOutput("Dropout2Out"), "Output", "Dropout2Out",
@@ -76,7 +66,6 @@ class FusedFeedForwardOp : public framework::OperatorWithKernel {
     }
     context->SetOutputDim("Dropout1Out", tmp_dim_x);
     context->SetOutputDim("Linear1Out", tmp_dim_x);
-    context->SetOutputDim("Ln1Out", dim_x);
     context->SetOutputDim("Dropout2Out", dim_x);
 
     if (context->Attrs().Get<bool>("dropout2_is_test") == false) {
@@ -84,10 +73,25 @@ class FusedFeedForwardOp : public framework::OperatorWithKernel {
     }
     framework::DDim mean_dim =
         framework::make_ddim({mat_dim_x.batch_size_ * mat_dim_x.height_});
-    context->SetOutputDim("Ln1Mean", mean_dim);
-    context->SetOutputDim("Ln1Variance", mean_dim);
-    context->SetOutputDim("Ln2Mean", mean_dim);
-    context->SetOutputDim("Ln2Variance", mean_dim);
+    bool pre_layer_norm = context->Attrs().Get<bool>("pre_layer_norm");
+    if (pre_layer_norm) {
+      OP_INOUT_CHECK(context->HasOutput("Ln1Mean"), "Output", "Ln1Mean",
+                     "fused_feedforward");
+      OP_INOUT_CHECK(context->HasOutput("Ln1Variance"), "Output", "Ln1Variance",
+                     "fused_feedforward");
+      OP_INOUT_CHECK(context->HasOutput("Ln1Out"), "Output", "Ln1Out",
+                     "fused_feedforward");
+      context->SetOutputDim("Ln1Out", dim_x);
+      context->SetOutputDim("Ln1Mean", mean_dim);
+      context->SetOutputDim("Ln1Variance", mean_dim);
+    } else {
+      OP_INOUT_CHECK(context->HasOutput("Ln2Mean"), "Output", "Ln2Mean",
+                     "fused_feedforward");
+      OP_INOUT_CHECK(context->HasOutput("Ln2Variance"), "Output", "Ln2Variance",
+                     "fused_feedforward");
+      context->SetOutputDim("Ln2Mean", mean_dim);
+      context->SetOutputDim("Ln2Variance", mean_dim);
+    }
     context->ShareLoD("X", "Out");
   }
 
@@ -218,14 +222,13 @@ class FusedFeedForwardOpGrad : public framework::OperatorWithKernel {
     PADDLE_ENFORCE_EQ(ctx->Attrs().Get<bool>("dropout2_is_test"), false,
                       platform::errors::InvalidArgument(
                           "GradOp is only callable when is_test is false"));
+    bool pre_layer_norm = ctx->Attrs().Get<bool>("pre_layer_norm");
     OP_INOUT_CHECK(ctx->HasInput("Dropout1Mask"), "Input", "Dropout1Mask",
                    "FusedFeedForwardGrad");
     OP_INOUT_CHECK(ctx->HasInput("Dropout2Mask"), "Input", "Dropout1Mask",
                    "FusedFeedForwardGrad");
     OP_INOUT_CHECK(ctx->HasInput("Linear1Out"), "Input", "Linear1Out",
                    "FusedFeedForwardGrad");
-    OP_INOUT_CHECK(ctx->HasInput("Ln1Out"), "Input", "Ln1Out",
-                   "FusedFeedForwardGrad");
     OP_INOUT_CHECK(ctx->HasInput("Dropout1Out"), "Input", "Dropout1Out",
                    "FusedFeedForwardGrad");
     OP_INOUT_CHECK(ctx->HasInput("Dropout2Out"), "Input", "Dropout2Out",
@@ -234,14 +237,19 @@ class FusedFeedForwardOpGrad : public framework::OperatorWithKernel {
                    "FusedFeedForwardGrad");
     OP_INOUT_CHECK(ctx->HasInput("Linear2Weight"), "Input", "Linear2Weight",
                    "FusedFeedForwardGrad");
-    OP_INOUT_CHECK(ctx->HasInput("Ln1Mean"), "Input", "Ln1Mean",
-                   "FusedFeedForwardGrad");
-    OP_INOUT_CHECK(ctx->HasInput("Ln1Variance"), "Input", "Ln1Variance",
-                   "FusedFeedForwardGrad");
-    OP_INOUT_CHECK(ctx->HasInput("Ln2Mean"), "Input", "Ln2Mean",
-                   "FusedFeedForwardGrad");
-    OP_INOUT_CHECK(ctx->HasInput("Ln2Variance"), "Input", "Ln2Variance",
-                   "FusedFeedForwardGrad");
+    if (pre_layer_norm) {
+      OP_INOUT_CHECK(ctx->HasInput("Ln1Mean"), "Input", "Ln1Mean",
+                     "FusedFeedForwardGrad");
+      OP_INOUT_CHECK(ctx->HasInput("Ln1Variance"), "Input", "Ln1Variance",
+                     "FusedFeedForwardGrad");
+      OP_INOUT_CHECK(ctx->HasInput("Ln1Out"), "Input", "Ln1Out",
+                     "FusedFeedForwardGrad");
+    } else {
+      OP_INOUT_CHECK(ctx->HasInput("Ln2Mean"), "Input", "Ln2Mean",
+                     "FusedFeedForwardGrad");
+      OP_INOUT_CHECK(ctx->HasInput("Ln2Variance"), "Input", "Ln2Variance",
+                     "FusedFeedForwardGrad");
+    }
 
     OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Out")), "Input",
                    framework::GradVarName("Out"), "FusedFeedForwardGrad");
@@ -299,30 +307,36 @@ class FusedFeedForwardOpGradMaker : public framework::SingleGradOpMaker<T> {
     op->SetInput("Linear1Weight", this->Input("Linear1Weight"));
     op->SetInput("Linear1Bias", this->Input("Linear1Bias"));
     op->SetInput("Linear2Weight", this->Input("Linear2Weight"));
-    op->SetInput("Ln1Scale", this->Input("Ln1Scale"));
-    op->SetInput("Ln1Bias", this->Input("Ln1Bias"));
-    op->SetInput("Ln2Scale", this->Input("Ln2Scale"));
-    op->SetInput("Ln2Bias", this->Input("Ln2Bias"));
     op->SetInput("Dropout1Mask", this->Output("Dropout1Mask"));
     op->SetInput("Dropout2Mask", this->Output("Dropout2Mask"));
     op->SetInput("Linear1Out", this->Output("Linear1Out"));
-    op->SetInput("Ln1Out", this->Output("Ln1Out"));
-    op->SetInput("Ln1Mean", this->Output("Ln1Mean"));
-    op->SetInput("Ln1Variance", this->Output("Ln1Variance"));
-    op->SetInput("Ln2Mean", this->Output("Ln2Mean"));
-    op->SetInput("Ln2Variance", this->Output("Ln2Variance"));
     op->SetInput("Dropout1Out", this->Output("Dropout1Out"));
     op->SetInput("Dropout2Out", this->Output("Dropout2Out"));
 
+    op->SetAttrMap(this->Attrs());
+    bool pre_layer_norm = BOOST_GET_CONST(bool, op->GetAttr("pre_layer_norm"));
+
     op->SetOutput(framework::GradVarName("X"), this->InputGrad("X"));
-    op->SetOutput(framework::GradVarName("Ln1Scale"),
-                  this->InputGrad("Ln1Scale"));
-    op->SetOutput(framework::GradVarName("Ln1Bias"),
-                  this->InputGrad("Ln1Bias"));
-    op->SetOutput(framework::GradVarName("Ln2Scale"),
-                  this->InputGrad("Ln2Scale"));
-    op->SetOutput(framework::GradVarName("Ln2Bias"),
-                  this->InputGrad("Ln2Bias"));
+    if (pre_layer_norm) {
+      op->SetInput("Ln1Scale", this->Input("Ln1Scale"));
+      op->SetInput("Ln1Bias", this->Input("Ln1Bias"));
+      op->SetInput("Ln1Out", this->Output("Ln1Out"));
+      op->SetInput("Ln1Mean", this->Output("Ln1Mean"));
+      op->SetInput("Ln1Variance", this->Output("Ln1Variance"));
+      op->SetOutput(framework::GradVarName("Ln1Scale"),
+                    this->InputGrad("Ln1Scale"));
+      op->SetOutput(framework::GradVarName("Ln1Bias"),
+                    this->InputGrad("Ln1Bias"));
+    } else {
+      op->SetInput("Ln2Scale", this->Input("Ln2Scale"));
+      op->SetInput("Ln2Bias", this->Input("Ln2Bias"));
+      op->SetInput("Ln2Mean", this->Output("Ln2Mean"));
+      op->SetInput("Ln2Variance", this->Output("Ln2Variance"));
+      op->SetOutput(framework::GradVarName("Ln2Scale"),
+                    this->InputGrad("Ln2Scale"));
+      op->SetOutput(framework::GradVarName("Ln2Bias"),
+                    this->InputGrad("Ln2Bias"));
+    }
     op->SetOutput(framework::GradVarName("Linear1Weight"),
                   this->InputGrad("Linear1Weight"));
     op->SetOutput(framework::GradVarName("Linear1Bias"),
@@ -334,8 +348,6 @@ class FusedFeedForwardOpGradMaker : public framework::SingleGradOpMaker<T> {
       op->SetOutput(framework::GradVarName("Linear2Bias"),
                     this->InputGrad("Linear2Bias"));
     }
-
-    op->SetAttrMap(this->Attrs());
   }
 };
 
diff --git a/paddle/fluid/operators/fused/fused_feedforward_op.cu b/paddle/fluid/operators/fused/fused_feedforward_op.cu
index 61a8a9a82f2e0..3b47e65c4833d 100644
--- a/paddle/fluid/operators/fused/fused_feedforward_op.cu
+++ b/paddle/fluid/operators/fused/fused_feedforward_op.cu
@@ -113,26 +113,40 @@ class FusedFeedForwardKernel : public framework::OpKernel<T> {
     auto* linear1_bias = context.Input<framework::Tensor>("Linear1Bias");
     auto* linear2_weight = context.Input<framework::Tensor>("Linear2Weight");
     auto* linear2_bias = context.Input<framework::Tensor>("Linear2Bias");
-    auto* ln1_scale = context.Input<framework::Tensor>("Ln1Scale");
-    auto* ln1_bias = context.Input<framework::Tensor>("Ln1Bias");
-    auto* ln2_scale = context.Input<framework::Tensor>("Ln2Scale");
-    auto* ln2_bias = context.Input<framework::Tensor>("Ln2Bias");
-
-    auto* ln1_mean = context.Output<framework::Tensor>("Ln1Mean");
-    auto* ln1_variance = context.Output<framework::Tensor>("Ln1Variance");
-    auto* ln2_mean = context.Output<framework::Tensor>("Ln2Mean");
-    auto* ln2_variance = context.Output<framework::Tensor>("Ln2Variance");
+    const bool pre_layer_norm = context.Attr<bool>("pre_layer_norm");
+
+    auto* ln1_scale =
+        pre_layer_norm ? context.Input<framework::Tensor>("Ln1Scale") : nullptr;
+    auto* ln1_bias =
+        pre_layer_norm ? context.Input<framework::Tensor>("Ln1Bias") : nullptr;
+    auto* ln2_scale = !pre_layer_norm
+                          ? context.Input<framework::Tensor>("Ln2Scale")
+                          : nullptr;
+    auto* ln2_bias =
+        !pre_layer_norm ? context.Input<framework::Tensor>("Ln2Bias") : nullptr;
+
+    auto* ln1_mean =
+        pre_layer_norm ? context.Output<framework::Tensor>("Ln1Mean") : nullptr;
+    auto* ln1_variance = pre_layer_norm
+                             ? context.Output<framework::Tensor>("Ln1Variance")
+                             : nullptr;
+    auto* ln2_mean = !pre_layer_norm
+                         ? context.Output<framework::Tensor>("Ln2Mean")
+                         : nullptr;
+    auto* ln2_variance = !pre_layer_norm
+                             ? context.Output<framework::Tensor>("Ln2Variance")
+                             : nullptr;
     auto* out = context.Output<framework::Tensor>("Out");
     auto* dropout1_mask = context.Output<framework::Tensor>("Dropout1Mask");
     auto* dropout2_mask = context.Output<framework::Tensor>("Dropout2Mask");
     auto* linear1_out = context.Output<framework::Tensor>("Linear1Out");
-    auto* ln1_out = context.Output<framework::Tensor>("Ln1Out");
+    auto* ln1_out =
+        pre_layer_norm ? context.Output<framework::Tensor>("Ln1Out") : nullptr;
     auto* dropout1_out = context.Output<framework::Tensor>("Dropout1Out");
     auto* dropout2_out = context.Output<framework::Tensor>("Dropout2Out");
 
     const std::string act_method = context.Attr<std::string>("act_method");
 
-    const bool pre_layer_norm = context.Attr<bool>("pre_layer_norm");
     const float epsilon1 = context.Attr<float>("ln1_epsilon");
     const float epsilon2 = context.Attr<float>("ln2_epsilon");
 
@@ -144,12 +158,16 @@ class FusedFeedForwardKernel : public framework::OpKernel<T> {
     out->mutable_data<T>(place);
     dropout1_mask->mutable_data<uint8_t>(place);
     dropout2_mask->mutable_data<uint8_t>(place);
-    ln1_mean->mutable_data<U>(place);
-    ln1_variance->mutable_data<U>(place);
-    ln2_mean->mutable_data<U>(place);
-    ln2_variance->mutable_data<U>(place);
+    if (pre_layer_norm) {
+      ln1_mean->mutable_data<U>(place);
+      ln1_variance->mutable_data<U>(place);
+      ln1_out->mutable_data<T>(place);
+    } else {
+      ln2_mean->mutable_data<U>(place);
+      ln2_variance->mutable_data<U>(place);
+    }
+
     linear1_out->mutable_data<T>(place);
-    ln1_out->mutable_data<T>(place);
     dropout1_out->mutable_data<T>(place);
     dropout2_out->mutable_data<T>(place);
 
@@ -193,16 +211,16 @@ class FusedFeedForwardGradKernel : public framework::OpKernel<T> {
       const framework::Tensor& d_out, const framework::Tensor& x,
       const framework::Tensor& dropout1_mask,
       const framework::Tensor& dropout2_mask,
-      const framework::Tensor& linear1_out, const framework::Tensor& ln1_out,
+      const framework::Tensor& linear1_out, const framework::Tensor* ln1_out,
       const framework::Tensor& dropout1_out,
       const framework::Tensor& dropout2_out,
       const framework::Tensor& linear1_weight,
       const framework::Tensor* linear1_bias,
       const framework::Tensor& linear2_weight,
       const framework::Tensor* ln1_gamma, const framework::Tensor* ln1_beta,
-      const framework::Tensor& ln1_mean, const framework::Tensor& ln1_variance,
+      const framework::Tensor* ln1_mean, const framework::Tensor* ln1_variance,
       const framework::Tensor* ln2_gamma, const framework::Tensor* ln2_beta,
-      const framework::Tensor& ln2_mean, const framework::Tensor& ln2_variance,
+      const framework::Tensor* ln2_mean, const framework::Tensor* ln2_variance,
       framework::Tensor* d_x, framework::Tensor* d_linear1_weight,
       framework::Tensor* d_linear1_bias, framework::Tensor* d_linear2_weight,
       framework::Tensor* d_linear2_bias, framework::Tensor* d_ln1_gamma,
@@ -252,8 +270,8 @@ class FusedFeedForwardGradKernel : public framework::OpKernel<T> {
     } else {
       fused_dropout_layernorm_helper.LayernormResidualDropoutBiasGrad(
           ctx, d_out.data<T>(), dropout2_out.data<T>(),
-          dropout2_mask.data<uint8_t>(), ln2_gamma_ptr, ln2_mean.data<U>(),
-          ln2_variance.data<U>(), d_dropout2_out.data<T>(), d_ln2_gamma_ptr,
+          dropout2_mask.data<uint8_t>(), ln2_gamma_ptr, ln2_mean->data<U>(),
+          ln2_variance->data<U>(), d_dropout2_out.data<T>(), d_ln2_gamma_ptr,
           d_ln2_beta_ptr, d_linear2_out.data<T>(), d_linear2_bias_ptr,
           d_residual.data<T>());
     }
@@ -273,13 +291,13 @@ class FusedFeedForwardGradKernel : public framework::OpKernel<T> {
     if (pre_layer_norm) {
       framework::Tensor d_ln1_out;
       d_ln1_out.mutable_data<T>({bsz_seq, d_model}, place);
-      MatMulGrad(ctx, d_linear1_out, ln1_out, linear1_weight, &d_ln1_out,
+      MatMulGrad(ctx, d_linear1_out, *ln1_out, linear1_weight, &d_ln1_out,
                  d_linear1_weight);
 
-      pre_layernorm_helper.LayerNormGrad(ctx, d_ln1_out.data<T>(), x.data<T>(),
-                                         ln1_gamma_ptr, ln1_mean.data<U>(),
-                                         ln1_variance.data<U>(), d_x->data<T>(),
-                                         d_ln1_gamma_ptr, d_ln1_beta_ptr);
+      pre_layernorm_helper.LayerNormGrad(
+          ctx, d_ln1_out.data<T>(), x.data<T>(), ln1_gamma_ptr,
+          ln1_mean->data<U>(), ln1_variance->data<U>(), d_x->data<T>(),
+          d_ln1_gamma_ptr, d_ln1_beta_ptr);
     } else {
       MatMulGrad(ctx, d_linear1_out, x, linear1_weight, d_x, d_linear1_weight);
     }
@@ -290,33 +308,52 @@ class FusedFeedForwardGradKernel : public framework::OpKernel<T> {
     auto d_out =
         *context.Input<framework::Tensor>(framework::GradVarName("Out"));
     auto x = *context.Input<framework::Tensor>("X");
+    const bool pre_layer_norm = context.Attr<bool>("pre_layer_norm");
     auto dropout1_mask = *context.Input<framework::Tensor>("Dropout1Mask");
     auto dropout2_mask = *context.Input<framework::Tensor>("Dropout2Mask");
     auto linear1_out = *context.Input<framework::Tensor>("Linear1Out");
-    auto ln1_out = *context.Input<framework::Tensor>("Ln1Out");
+    auto* ln1_out =
+        pre_layer_norm ? context.Input<framework::Tensor>("Ln1Out") : nullptr;
     auto dropout1_out = *context.Input<framework::Tensor>("Dropout1Out");
     auto dropout2_out = *context.Input<framework::Tensor>("Dropout2Out");
     auto linear1_weight = *context.Input<framework::Tensor>("Linear1Weight");
     auto* linear1_bias = context.Input<framework::Tensor>("Linear1Bias");
     auto linear2_weight = *context.Input<framework::Tensor>("Linear2Weight");
-    auto ln1_mean = *context.Input<framework::Tensor>("Ln1Mean");
-    auto ln1_variance = *context.Input<framework::Tensor>("Ln1Variance");
-    auto* ln1_scale = context.Input<framework::Tensor>("Ln1Scale");
-    auto* ln1_bias = context.Input<framework::Tensor>("Ln1Bias");
-    auto ln2_mean = *context.Input<framework::Tensor>("Ln2Mean");
-    auto ln2_variance = *context.Input<framework::Tensor>("Ln2Variance");
-    auto* ln2_scale = context.Input<framework::Tensor>("Ln2Scale");
-    auto* ln2_bias = context.Input<framework::Tensor>("Ln2Bias");
+    auto* ln1_mean =
+        pre_layer_norm ? context.Input<framework::Tensor>("Ln1Mean") : nullptr;
+    auto* ln1_variance = pre_layer_norm
+                             ? context.Input<framework::Tensor>("Ln1Variance")
+                             : nullptr;
+    auto* ln1_scale =
+        pre_layer_norm ? context.Input<framework::Tensor>("Ln1Scale") : nullptr;
+    auto* ln1_bias =
+        pre_layer_norm ? context.Input<framework::Tensor>("Ln1Bias") : nullptr;
+    auto* ln2_mean =
+        !pre_layer_norm ? context.Input<framework::Tensor>("Ln2Mean") : nullptr;
+    auto* ln2_variance = !pre_layer_norm
+                             ? context.Input<framework::Tensor>("Ln2Variance")
+                             : nullptr;
+    auto* ln2_scale = !pre_layer_norm
+                          ? context.Input<framework::Tensor>("Ln2Scale")
+                          : nullptr;
+    auto* ln2_bias =
+        !pre_layer_norm ? context.Input<framework::Tensor>("Ln2Bias") : nullptr;
 
     auto* d_x = context.Output<framework::Tensor>(framework::GradVarName("X"));
-    auto* d_ln1_scale =
-        context.Output<framework::Tensor>(framework::GradVarName("Ln1Scale"));
-    auto* d_ln1_bias =
-        context.Output<framework::Tensor>(framework::GradVarName("Ln1Bias"));
+    auto* d_ln1_scale = pre_layer_norm
+                            ? context.Output<framework::Tensor>(
+                                  framework::GradVarName("Ln1Scale"))
+                            : nullptr;
+    auto* d_ln1_bias = pre_layer_norm
+                           ? context.Output<framework::Tensor>(
+                                 framework::GradVarName("Ln1Bias"))
+                           : nullptr;
     auto* d_ln2_scale =
-        context.Output<framework::Tensor>(framework::GradVarName("Ln2Scale"));
+        pre_layer_norm ? nullptr : context.Output<framework::Tensor>(
+                                       framework::GradVarName("Ln2Scale"));
     auto* d_ln2_bias =
-        context.Output<framework::Tensor>(framework::GradVarName("Ln2Bias"));
+        pre_layer_norm ? nullptr : context.Output<framework::Tensor>(
+                                       framework::GradVarName("Ln2Bias"));
     auto* d_linear1_weight = context.Output<framework::Tensor>(
         framework::GradVarName("Linear1Weight"));
     auto* d_linear1_bias = context.Output<framework::Tensor>(
@@ -328,7 +365,6 @@ class FusedFeedForwardGradKernel : public framework::OpKernel<T> {
 
     const float epsilon1 = context.Attr<float>("ln1_epsilon");
     const float epsilon2 = context.Attr<float>("ln2_epsilon");
-    const bool pre_layer_norm = context.Attr<bool>("pre_layer_norm");
     const std::string act_method = context.Attr<std::string>("act_method");
     DropoutParam dropout_param1(context, 1);
     DropoutParam dropout_param2(context, 2);
diff --git a/paddle/fluid/operators/fused/mkldnn/fusion_gru_mkldnn_op.cc b/paddle/fluid/operators/fused/mkldnn/fusion_gru_mkldnn_op.cc
index 8e0627fc15c22..e1506e3708366 100644
--- a/paddle/fluid/operators/fused/mkldnn/fusion_gru_mkldnn_op.cc
+++ b/paddle/fluid/operators/fused/mkldnn/fusion_gru_mkldnn_op.cc
@@ -21,7 +21,6 @@ namespace operators {
 using paddle::framework::LoDTensor;
 using paddle::framework::Tensor;
 using paddle::platform::CPUDeviceContext;
-using paddle::platform::CreateKey;
 using paddle::platform::MKLDNNGetDataType;
 using paddle::platform::MKLDNNMemDesc;
 using platform::to_void_cast;
diff --git a/paddle/fluid/operators/fused/mkldnn/fusion_lstm_mkldnn_op.cc b/paddle/fluid/operators/fused/mkldnn/fusion_lstm_mkldnn_op.cc
index a61a3de62f397..edf541fde2a51 100644
--- a/paddle/fluid/operators/fused/mkldnn/fusion_lstm_mkldnn_op.cc
+++ b/paddle/fluid/operators/fused/mkldnn/fusion_lstm_mkldnn_op.cc
@@ -21,7 +21,6 @@ namespace operators {
 using paddle::framework::LoDTensor;
 using paddle::framework::Tensor;
 using paddle::platform::CPUDeviceContext;
-using paddle::platform::CreateKey;
 using paddle::platform::MKLDNNGetDataType;
 using paddle::platform::MKLDNNMemDesc;
 using platform::to_void_cast;
diff --git a/paddle/fluid/operators/gather_nd_op_xpu.cc b/paddle/fluid/operators/gather_nd_op_xpu.cc
new file mode 100644
index 0000000000000..c7e4169865fa6
--- /dev/null
+++ b/paddle/fluid/operators/gather_nd_op_xpu.cc
@@ -0,0 +1,79 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef PADDLE_WITH_XPU
+
+#include "paddle/fluid/operators/gather_nd_op.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename T>
+class GatherNdXPUKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &ctx) const override {
+    auto *x = ctx.Input<Tensor>("X");
+    auto *index = ctx.Input<Tensor>("Index");
+    auto *out = ctx.Output<Tensor>("Out");
+
+    out->template mutable_data<T>(ctx.GetPlace());
+    if (x->numel() == 0) return;
+
+    if (index->numel() == 0) {
+      framework::TensorCopy(*x, ctx.GetPlace(), ctx.device_context(), out);
+      return;
+    }
+
+    const auto &index_type = index->type();
+    bool index_type_match = index_type == framework::proto::VarType::INT32 ||
+                            index_type == framework::proto::VarType::INT64;
+    PADDLE_ENFORCE_EQ(index_type_match, true,
+                      platform::errors::InvalidArgument(
+                          "Index holds the wrong type, it holds [%s],"
+                          "but desires to be [%s] or [%s]",
+                          paddle::framework::DataTypeToString(index_type),
+                          paddle::framework::DataTypeToString(
+                              framework::proto::VarType::INT32),
+                          paddle::framework::DataTypeToString(
+                              framework::proto::VarType::INT64)));
+
+    auto x_shape = paddle::framework::vectorize<int>(x->dims());
+    auto index_shape = paddle::framework::vectorize<int>(index->dims());
+    xpu::VectorParam<int> x_vec = {x_shape.data(),
+                                   static_cast<int>(x_shape.size()), nullptr};
+    auto &dev_ctx =
+        ctx.template device_context<paddle::platform::XPUDeviceContext>();
+    int ret = XPU_SUCCESS;
+    if (index_type == framework::proto::VarType::INT32) {
+      ret = xpu::gather_nd<T, int>(dev_ctx.x_context(), x->data<T>(),
+                                   index->data<int>(), out->data<T>(), x_vec,
+                                   index_shape);
+    } else {
+      ret = xpu::gather_nd<T, int64_t>(dev_ctx.x_context(), x->data<T>(),
+                                       index->data<int64_t>(), out->data<T>(),
+                                       x_vec, index_shape);
+    }
+    PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS,
+                      platform::errors::External(
+                          "XPU gather_nd kernel return wrong value[%d %s]", ret,
+                          XPUAPIErrorMsg[ret]));
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_XPU_KERNEL(gather_nd, ops::GatherNdXPUKernel<int>,
+                       ops::GatherNdXPUKernel<int64_t>,
+                       ops::GatherNdXPUKernel<float>);
+
+#endif
diff --git a/paddle/fluid/operators/gather_op_xpu.cc b/paddle/fluid/operators/gather_op_xpu.cc
index 6d1dac8304050..d9fdbb2a9dd75 100644
--- a/paddle/fluid/operators/gather_op_xpu.cc
+++ b/paddle/fluid/operators/gather_op_xpu.cc
@@ -24,6 +24,8 @@ namespace operators {
 
 template <typename T>
 class GatherOpXPUKernel : public framework::OpKernel<T> {
+  using XPUType = typename XPUTypeTrait<T>::Type;
+
  public:
   void Compute(const framework::ExecutionContext &ctx) const override {
     PADDLE_ENFORCE_EQ(
@@ -63,13 +65,16 @@ class GatherOpXPUKernel : public framework::OpKernel<T> {
     auto &dev_ctx = ctx.template device_context<platform::XPUDeviceContext>();
     int r = XPU_SUCCESS;
     if (index->type() == framework::proto::VarType::INT32) {
-      r = xpu::gather<T, int>(dev_ctx.x_context(), x->data<T>(),
-                              index->data<int>(), output->data<T>(), xshape,
-                              index->dims()[0], 0);
+      r = xpu::gather<XPUType, int>(
+          dev_ctx.x_context(), reinterpret_cast<const XPUType *>(x->data<T>()),
+          index->data<int>(), reinterpret_cast<XPUType *>(output->data<T>()),
+          xshape, index->dims()[0], 0);
     } else {
-      r = xpu::gather<T, int64_t>(dev_ctx.x_context(), x->data<T>(),
-                                  index->data<int64_t>(), output->data<T>(),
-                                  xshape, index->dims()[0], 0);
+      r = xpu::gather<XPUType, int64_t>(
+          dev_ctx.x_context(), reinterpret_cast<const XPUType *>(x->data<T>()),
+          index->data<int64_t>(),
+          reinterpret_cast<XPUType *>(output->data<T>()), xshape,
+          index->dims()[0], 0);
     }
     PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS,
                       platform::errors::External(
@@ -80,6 +85,8 @@ class GatherOpXPUKernel : public framework::OpKernel<T> {
 
 template <typename T>
 class GatherGradOpXPUKernel : public framework::OpKernel<T> {
+  using XPUType = typename XPUTypeTrait<T>::Type;
+
  public:
   void Compute(const framework::ExecutionContext &ctx) const override {
     PADDLE_ENFORCE_EQ(
@@ -123,13 +130,28 @@ class GatherGradOpXPUKernel : public framework::OpKernel<T> {
 
     int r = XPU_SUCCESS;
     if (index->type() == framework::proto::VarType::INT32) {
-      r = xpu::gather_grad<T, int>(dev_ctx.x_context(), dout->data<T>(),
-                                   index->data<int>(), dx->data<T>(), xshape,
-                                   index->dims()[0], 0, overwrite);
+      r = xpu::gather_grad<XPUType, int>(
+          dev_ctx.x_context(),
+          reinterpret_cast<const XPUType *>(dout->data<T>()),
+          index->data<int>(), reinterpret_cast<XPUType *>(dx->data<T>()),
+          xshape, index->dims()[0], 0, overwrite);
     } else {
-      r = xpu::gather_grad<T, int64_t>(dev_ctx.x_context(), dout->data<T>(),
-                                       index->data<int64_t>(), dx->data<T>(),
-                                       xshape, index->dims()[0], 0, overwrite);
+      xpu::ctx_guard RAII_GUARD(dev_ctx.x_context());
+      int *index_int_ptr_l3 =
+          RAII_GUARD.alloc_l3_or_gm<int32_t>(index->numel());
+      r = xpu::cast_v2<int64_t, int32_t>(dev_ctx.x_context(),
+                                         index->data<int64_t>(),
+                                         index_int_ptr_l3, index->numel());
+      PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, platform::errors::External(
+                                            "XPU API(cast_v2) return wrong "
+                                            "value[%d %s]",
+                                            r, XPUAPIErrorMsg[r]));
+
+      r = xpu::gather_grad<XPUType, int>(
+          dev_ctx.x_context(),
+          reinterpret_cast<const XPUType *>(dout->data<T>()), index_int_ptr_l3,
+          reinterpret_cast<XPUType *>(dx->data<T>()), xshape, index->dims()[0],
+          0, overwrite);
     }
     PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS,
                       platform::errors::External(
@@ -142,6 +164,8 @@ class GatherGradOpXPUKernel : public framework::OpKernel<T> {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP_XPU_KERNEL(gather, ops::GatherOpXPUKernel<float>);
-REGISTER_OP_XPU_KERNEL(gather_grad, ops::GatherGradOpXPUKernel<float>);
+REGISTER_OP_XPU_KERNEL(gather, ops::GatherOpXPUKernel<float>,
+                       ops::GatherOpXPUKernel<paddle::platform::float16>);
+REGISTER_OP_XPU_KERNEL(gather_grad, ops::GatherGradOpXPUKernel<float>,
+                       ops::GatherGradOpXPUKernel<paddle::platform::float16>);
 #endif
diff --git a/paddle/fluid/operators/gelu_op_xpu.cc b/paddle/fluid/operators/gelu_op_xpu.cc
new file mode 100644
index 0000000000000..b8c2e9becf295
--- /dev/null
+++ b/paddle/fluid/operators/gelu_op_xpu.cc
@@ -0,0 +1,89 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <memory>
+#include <string>
+
+#include "paddle/fluid/operators/gelu_op.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+
+template <typename DeviceContext, typename T>
+class GeluXPUKernel : public framework::OpKernel<T> {
+  using XPUType = typename XPUTypeTrait<T>::Type;
+
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* x = ctx.Input<Tensor>("X");
+
+    auto* out = ctx.Output<Tensor>("Out");
+
+    auto place = ctx.GetPlace();
+
+    const XPUType* x_data = reinterpret_cast<const XPUType*>(x->data<T>());
+    XPUType* y_data = reinterpret_cast<XPUType*>(out->mutable_data<T>(place));
+    auto& dev_ctx = ctx.template device_context<DeviceContext>();
+    int r = xpu::gelu<XPUType>(dev_ctx.x_context(), x_data, y_data, x->numel());
+    PADDLE_ENFORCE_EQ(
+        r, XPU_SUCCESS,
+        platform::errors::External("XPU gelu kernel return wrong value[%d %s]",
+                                   r, XPUAPIErrorMsg[r]));
+  }
+};
+
+template <typename DeviceContext, typename T>
+class GeluGradXPUKernel : public framework::OpKernel<T> {
+  using XPUType = typename XPUTypeTrait<T>::Type;
+
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* x = ctx.Input<Tensor>("X");
+    auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
+
+    auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
+
+    auto place = ctx.GetPlace();
+    const XPUType* x_data = reinterpret_cast<const XPUType*>(x->data<T>());
+    const XPUType* dout_data =
+        reinterpret_cast<const XPUType*>(dout->data<T>());
+    XPUType* dx_data = reinterpret_cast<XPUType*>(dx->mutable_data<T>(place));
+    auto& dev_ctx = ctx.template device_context<DeviceContext>();
+
+    int r = xpu::gelu_grad<XPUType>(dev_ctx.x_context(), x_data, nullptr,
+                                    dout_data, dx_data, dout->numel());
+    PADDLE_ENFORCE_EQ(r, XPU_SUCCESS,
+                      platform::errors::External(
+                          "XPU gelu_grad kernel return wrong value[%d %s]", r,
+                          XPUAPIErrorMsg[r]));
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+
+REGISTER_OP_XPU_KERNEL(
+    gelu, ops::GeluXPUKernel<paddle::platform::XPUDeviceContext, float>,
+    ops::GeluXPUKernel<paddle::platform::XPUDeviceContext,
+                       paddle::platform::float16>);
+
+REGISTER_OP_XPU_KERNEL(
+    gelu_grad,
+    ops::GeluGradXPUKernel<paddle::platform::XPUDeviceContext, float>,
+    ops::GeluGradXPUKernel<paddle::platform::XPUDeviceContext,
+                           paddle::platform::float16>);
diff --git a/paddle/fluid/operators/index_select_op.cu b/paddle/fluid/operators/index_select_op.cu
index 43761d97962a4..2353781daaa39 100644
--- a/paddle/fluid/operators/index_select_op.cu
+++ b/paddle/fluid/operators/index_select_op.cu
@@ -54,14 +54,18 @@ __global__ void index_select_grad_cuda_kernel(const T* output_grad,
 
   int64_t pre_idx = idx / (stride * size);
   int64_t dim_idx = idx % (stride * size) / stride;
-  int64_t begin_idx = idx + (delta * pre_idx - dim_idx) * stride;
+  IndexT src_dim_idx = index[dim_idx];
+  int64_t input_idx = idx + (delta * pre_idx + src_dim_idx - dim_idx) * stride;
+  paddle::platform::CudaAtomicAdd(&input_grad[input_idx], output_grad[idx]);
+}
 
-  input_grad[idx] = 0.0;
-  for (int64_t i = 0; i < nums; i++) {
-    if (index[i] == dim_idx) {
-      input_grad[idx] += output_grad[begin_idx + i * stride];
-    }
+template <typename T>
+__global__ void index_select_grad_init(T* input_grad, int64_t N) {
+  int64_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (idx >= N) {
+    return;
   }
+  input_grad[idx] = 0.0;
 }
 
 template <typename DeviceContext, typename T>
@@ -143,8 +147,8 @@ class IndexSelectGradCUDAKernel : public framework::OpKernel<T> {
     dim = dim >= 0 ? dim : dim + input_dim.size();
     auto stride_dim = framework::stride(input_dim);
     int64_t stride = stride_dim[dim];
-    int64_t size = input_dim[dim];
-    int64_t delta = output_dim[dim] - size;
+    int64_t size = output_dim[dim];
+    int64_t delta = input_dim[dim] - size;
 
     const auto& index_type = index->type();
     bool index_type_match = index_type == framework::proto::VarType::INT64 ||
@@ -161,17 +165,22 @@ class IndexSelectGradCUDAKernel : public framework::OpKernel<T> {
 
     int64_t numel = in_grad->numel();
     int64_t index_nums = index->numel();
+    int64_t out_nums = output_grad->numel();
 
     auto stream =
         context.template device_context<platform::CUDADeviceContext>().stream();
 
+    index_select_grad_init<
+        T><<<(numel + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS,
+             PADDLE_CUDA_NUM_THREADS, 0, stream>>>(in_grad_data, numel);
+
     if (index_type == framework::proto::VarType::INT64) {
       const int64_t* index_data = index->data<int64_t>();
       index_select_grad_cuda_kernel<T, int64_t><<<
-          (numel + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS,
+          (out_nums + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS,
           PADDLE_CUDA_NUM_THREADS, 0, stream>>>(output_grad_data, in_grad_data,
-                                                index_data, index_nums, numel,
-                                                stride, size, delta);
+                                                index_data, index_nums,
+                                                out_nums, stride, size, delta);
 #ifdef PADDLE_WITH_HIP
       PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamSynchronize(stream));
 #else
@@ -180,10 +189,10 @@ class IndexSelectGradCUDAKernel : public framework::OpKernel<T> {
     } else {
       const int* index_data = index->data<int>();
       index_select_grad_cuda_kernel<T, int><<<
-          (numel + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS,
+          (out_nums + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS,
           PADDLE_CUDA_NUM_THREADS, 0, stream>>>(output_grad_data, in_grad_data,
-                                                index_data, index_nums, numel,
-                                                stride, size, delta);
+                                                index_data, index_nums,
+                                                out_nums, stride, size, delta);
 #ifdef PADDLE_WITH_HIP
       PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamSynchronize(stream));
 #else
@@ -201,12 +210,16 @@ REGISTER_OP_CUDA_KERNEL(
     index_select,
     ops::IndexSelectCUDAKernel<paddle::platform::CUDADeviceContext, float>,
     ops::IndexSelectCUDAKernel<paddle::platform::CUDADeviceContext, double>,
+    ops::IndexSelectCUDAKernel<paddle::platform::CUDADeviceContext,
+                               paddle::platform::float16>,
     ops::IndexSelectCUDAKernel<paddle::platform::CUDADeviceContext, int>,
     ops::IndexSelectCUDAKernel<paddle::platform::CUDADeviceContext, int64_t>);
 REGISTER_OP_CUDA_KERNEL(
     index_select_grad,
     ops::IndexSelectGradCUDAKernel<paddle::platform::CUDADeviceContext, float>,
     ops::IndexSelectGradCUDAKernel<paddle::platform::CUDADeviceContext, double>,
+    ops::IndexSelectGradCUDAKernel<paddle::platform::CUDADeviceContext,
+                                   paddle::platform::float16>,
     ops::IndexSelectGradCUDAKernel<paddle::platform::CUDADeviceContext, int>,
     ops::IndexSelectGradCUDAKernel<paddle::platform::CUDADeviceContext,
                                    int64_t>);
diff --git a/paddle/fluid/operators/interpolate_v2_op_npu.cc b/paddle/fluid/operators/interpolate_v2_op_npu.cc
index b30c7ac810c01..24ad6746ced95 100644
--- a/paddle/fluid/operators/interpolate_v2_op_npu.cc
+++ b/paddle/fluid/operators/interpolate_v2_op_npu.cc
@@ -20,6 +20,369 @@ namespace operators {
 
 using Tensor = framework::Tensor;
 using DataLayout = framework::DataLayout;
+using DDim = framework::DDim;
+using fp16 = paddle::platform::float16;
+
+template <typename T>
+struct InterpolateFunction {
+ public:
+  explicit InterpolateFunction(const framework::ExecutionContext& ctx)
+      : ctx(ctx) {
+    place = ctx.GetPlace();
+    stream = ctx.template device_context<paddle::platform::NPUDeviceContext>()
+                 .stream();
+    t0.mutable_data<float>({1}, place);
+    t1.mutable_data<float>({1}, place);
+    tn.mutable_data<float>({1}, place);
+    FillNpuTensorWithConstant<float>(&t0, static_cast<float>(0));
+    FillNpuTensorWithConstant<float>(&t1, static_cast<float>(1));
+  }
+  void Arange(int n, Tensor* x) {
+    FillNpuTensorWithConstant<float>(&tn, static_cast<float>(n));
+    const auto& runner = NpuOpRunner("Range", {t0, tn, t1}, {*x}, {});
+    runner.Run(stream);
+  }
+  void ReduceSum(const Tensor* x, Tensor* y, const std::vector<int>& dim,
+                 bool keep_dims = true) {
+    const auto& runner = NpuOpRunner("ReduceSumD", {*x}, {*y},
+                                     {{"axes", dim}, {"keep_dims", keep_dims}});
+    runner.Run(stream);
+  }
+  void Add(const Tensor* x, const Tensor* y, Tensor* z) {
+    const auto& runner = NpuOpRunner("AddV2", {*x, *y}, {*z}, {});
+    runner.Run(stream);
+  }
+  void Adds(const Tensor* x, float scalar, Tensor* y) {
+    const auto& runner = NpuOpRunner("Adds", {*x}, {*y}, {{"value", scalar}});
+    runner.Run(stream);
+  }
+  void Mul(const Tensor* x, const Tensor* y, Tensor* z) {
+    const auto& runner = NpuOpRunner("Mul", {*x, *y}, {*z}, {});
+    runner.Run(stream);
+  }
+  void Sub(const Tensor* x, const Tensor* y, Tensor* z) {
+    const auto& runner = NpuOpRunner("Sub", {*x, *y}, {*z}, {});
+    runner.Run(stream);
+  }
+  void Cast(const Tensor* x, Tensor* y) {
+    auto dst_dtype = ConvertToNpuDtype(y->type());
+    const auto& runner = NpuOpRunner(
+        "Cast", {*x}, {*y}, {{"dst_type", static_cast<int>(dst_dtype)}});
+    runner.Run(stream);
+  }
+  void Gather(const Tensor* x, const Tensor* indices, const int axis,
+              Tensor* y) {
+    const auto& runner =
+        NpuOpRunner("GatherV2D", {*x, *indices}, {*y}, {{"axis", axis}});
+    runner.Run(stream);
+  }
+  void GatherGrad(const Tensor* gy, const Tensor* indices, const int axis,
+                  Tensor* gx) {
+    //  1  gy swapaxis: axis & 0
+    int len = (gy->dims()).size();
+    std::vector<int> axis_swap(len);
+    for (int i = 0; i < len; i++) {
+      axis_swap[i] = i;
+    }
+    axis_swap[0] = axis;
+    axis_swap[axis] = 0;
+    auto y_new_shape = gy->dims();
+    auto yt = y_new_shape[axis];
+    y_new_shape[axis] = y_new_shape[0];
+    y_new_shape[0] = yt;
+    Tensor gy_t;
+    gy_t.mutable_data<T>(y_new_shape, place);
+    Transpose(gy, &gy_t, axis_swap);
+    //  2  scatter
+    auto x_new_shape = gx->dims();
+    auto xt = x_new_shape[axis];
+    x_new_shape[axis] = x_new_shape[0];
+    x_new_shape[0] = xt;
+    Tensor gx_zero, gx_t;
+    gx_zero.mutable_data<T>(x_new_shape, place);
+    gx_t.mutable_data<T>(x_new_shape, place);
+    FillNpuTensorWithConstant<T>(&gx_zero, static_cast<T>(0));
+    gx_zero.Resize(x_new_shape);
+    Scatter(&gx_zero, indices, &gy_t, &gx_t);
+    //  3  gx swapaxis: axis, 0
+    Transpose(&gx_t, gx, axis_swap);
+  }
+  void Scatter(const Tensor* x, const Tensor* index, const Tensor* updates,
+               Tensor* y) {
+    const auto& runner =
+        NpuOpRunner("TensorScatterAdd", {*x, *index, *updates}, {*y}, {});
+    runner.Run(stream);
+  }
+  void Transpose(const Tensor* x, Tensor* y, const std::vector<int>& axis) {
+    const auto& runner =
+        NpuOpRunner("TransposeD", {*x}, {*y}, {{"perm", axis}});
+    runner.Run(stream);
+  }
+  void Muls(const Tensor* x, float scalar, Tensor* y) {
+    const auto& runner = NpuOpRunner("Muls", {*x}, {*y}, {{"value", scalar}});
+    runner.Run(stream);
+  }
+  void Maximum(const Tensor* x, const Tensor* y, Tensor* z) {
+    const auto& runner = NpuOpRunner("Maximum", {*x, *y}, {*z}, {});
+    runner.Run(stream);
+  }
+  void Minimum(const Tensor* x, const Tensor* y, Tensor* z) {
+    const auto& runner = NpuOpRunner("Minimum", {*x, *y}, {*z}, {});
+    runner.Run(stream);
+  }
+  void Floor(const Tensor* x, Tensor* y) {
+    const auto& runner = NpuOpRunner("Floor", {*x}, {*y}, {});
+    runner.Run(stream);
+  }
+
+ private:
+  platform::Place place;
+  aclrtStream stream;
+  const framework::ExecutionContext& ctx;
+  Tensor t0;
+  Tensor t1;
+  Tensor tn;
+};
+
+template <>
+void InterpolateFunction<fp16>::Arange(int n, Tensor* x) {
+  Tensor x_fp32(framework::proto::VarType::FP32);
+  x_fp32.mutable_data<float>(x->dims(), place);
+  FillNpuTensorWithConstant<float>(&tn, static_cast<float>(n));
+  const auto& runner = NpuOpRunner("Range", {t0, tn, t1}, {x_fp32}, {});
+  runner.Run(stream);
+  Cast(&x_fp32, x);
+}
+
+void InterpolateParamCompute(const float scale_h, const float scale_w,
+                             const bool align_corners, const int align_mode,
+                             const DataLayout& data_layout, const DDim& indim,
+                             const DDim& outdim, int* axis_h, int* axis_w,
+                             int* in_h, int* in_w, int* out_h, int* out_w,
+                             float* ratio_h, float* ratio_w) {
+  if (data_layout == DataLayout::kNCHW) {
+    *axis_h = 2;
+    *axis_w = 3;
+  } else {
+    *axis_h = 1;
+    *axis_w = 2;
+  }
+  *out_h = outdim[*axis_h];
+  *out_w = outdim[*axis_w];
+  *in_h = indim[*axis_h];
+  *in_w = indim[*axis_w];
+  *ratio_h = 0.0f;
+  *ratio_w = 0.0f;
+  if (*out_h > 1) {
+    *ratio_h =
+        align_corners
+            ? static_cast<float>(*in_h - 1) / (*out_h - 1)
+            : (scale_h > 0 ? 1 / scale_h : static_cast<float>(*in_h) / *out_h);
+  }
+  if (*out_w > 1) {
+    *ratio_w =
+        align_corners
+            ? static_cast<float>(*in_w - 1) / (*out_w - 1)
+            : (scale_w > 0 ? 1 / scale_w : static_cast<float>(*in_w) / *out_w);
+  }
+}
+
+template <typename T>
+void BilinearParamTensorCompute(const framework::ExecutionContext& ctx,
+                                const DataLayout& data_layout, int in_h,
+                                int in_w, int out_h, int out_w, bool align_cond,
+                                float ratio_h, float ratio_w, Tensor* h0,
+                                Tensor* h1, Tensor* w0, Tensor* w1,
+                                Tensor* coef_h0, Tensor* coef_h1,
+                                Tensor* coef_w0, Tensor* coef_w1) {
+  InterpolateFunction<T> F(ctx);
+  auto place = ctx.GetPlace();
+  Tensor _h0, _w0;
+  _h0.mutable_data<T>({out_h}, place);
+  _w0.mutable_data<T>({out_w}, place);
+  F.Arange(out_h, &_h0);
+  F.Arange(out_w, &_w0);
+  if (align_cond) {
+    F.Adds(&_h0, static_cast<float>(0.5), &_h0);
+    F.Adds(&_w0, static_cast<float>(0.5), &_w0);
+    F.Muls(&_h0, ratio_h, &_h0);
+    F.Muls(&_w0, ratio_w, &_w0);
+    F.Adds(&_h0, static_cast<float>(-0.5), &_h0);
+    F.Adds(&_w0, static_cast<float>(-0.5), &_w0);
+  } else {
+    F.Muls(&_h0, ratio_h, &_h0);
+    F.Muls(&_w0, ratio_w, &_w0);
+  }
+
+  Tensor zero_t;
+  Tensor one_t;
+  zero_t.mutable_data<T>({1}, place);
+  one_t.mutable_data<T>({1}, place);
+  FillNpuTensorWithConstant<T>(&zero_t, static_cast<T>(0));
+  FillNpuTensorWithConstant<T>(&one_t, static_cast<T>(1));
+  F.Maximum(&_h0, &zero_t, &_h0);
+  F.Maximum(&_w0, &zero_t, &_w0);
+
+  Tensor _h0_floor, _w0_floor;
+  _h0_floor.mutable_data<T>({out_h}, place);
+  _w0_floor.mutable_data<T>({out_w}, place);
+  F.Floor(&_h0, &_h0_floor);
+  F.Floor(&_w0, &_w0_floor);
+  F.Cast(&_h0_floor, h0);
+  F.Cast(&_w0_floor, w0);
+
+  Tensor one_int;
+  one_int.mutable_data<int>({1}, place);
+  FillNpuTensorWithConstant<int>(&one_int, static_cast<int>(1));
+  F.Add(h0, &one_int, h1);
+  F.Add(w0, &one_int, w1);
+  Tensor t_max_h, t_max_w;
+  t_max_h.mutable_data<int>({1}, place);
+  t_max_w.mutable_data<int>({1}, place);
+  FillNpuTensorWithConstant<int>(&t_max_h, static_cast<int>(in_h - 1));
+  FillNpuTensorWithConstant<int>(&t_max_w, static_cast<int>(in_w - 1));
+  F.Minimum(h1, &t_max_h, h1);
+  F.Minimum(w1, &t_max_w, w1);
+
+  F.Sub(&_h0, &_h0_floor, coef_h1);
+  F.Sub(&_w0, &_w0_floor, coef_w1);
+  F.Sub(&one_t, coef_h1, coef_h0);
+  F.Sub(&one_t, coef_w1, coef_w0);
+
+  if (data_layout == DataLayout::kNCHW) {
+    coef_h0->Resize({out_h, 1});
+    coef_h1->Resize({out_h, 1});
+  } else {
+    coef_h0->Resize({out_h, 1, 1});
+    coef_h1->Resize({out_h, 1, 1});
+    coef_w0->Resize({out_w, 1});
+    coef_w1->Resize({out_w, 1});
+  }
+}
+
+template <typename T>
+void BilinearFwdNpu(const framework::ExecutionContext& ctx, const Tensor* input,
+                    Tensor* output, const float scale_h, const float scale_w,
+                    const bool align_corners, const int align_mode,
+                    const DataLayout& data_layout) {
+  InterpolateFunction<T> F(ctx);
+  auto place = ctx.GetPlace();
+  auto outdim = output->dims();
+  auto indim = input->dims();
+
+  int axis_h, axis_w;
+  int out_h, out_w, in_h, in_w;
+  float ratio_h, ratio_w;
+  InterpolateParamCompute(scale_h, scale_w, align_corners, align_mode,
+                          data_layout, indim, outdim, &axis_h, &axis_w, &in_h,
+                          &in_w, &out_h, &out_w, &ratio_h, &ratio_w);
+
+  Tensor h0, h1, w0, w1;
+  h0.mutable_data<int>({out_h}, place);
+  h1.mutable_data<int>({out_h}, place);
+  w0.mutable_data<int>({out_w}, place);
+  w1.mutable_data<int>({out_w}, place);
+  Tensor coef_h0, coef_h1, coef_w0, coef_w1;
+  coef_h0.mutable_data<T>({out_h}, place);
+  coef_h1.mutable_data<T>({out_h}, place);
+  coef_w0.mutable_data<T>({out_w}, place);
+  coef_w1.mutable_data<T>({out_w}, place);
+  bool align_cond = align_mode == 0 && !align_corners;
+  BilinearParamTensorCompute<T>(ctx, data_layout, in_h, in_w, out_h, out_w,
+                                align_cond, ratio_h, ratio_w, &h0, &h1, &w0,
+                                &w1, &coef_h0, &coef_h1, &coef_w0, &coef_w1);
+
+  Tensor input_gather_h0, input_gather_h1;
+  auto dim_gather_h = indim;
+  dim_gather_h[axis_h] = out_h;
+  input_gather_h0.mutable_data<T>(dim_gather_h, place);
+  input_gather_h1.mutable_data<T>(dim_gather_h, place);
+
+  F.Gather(input, &h0, axis_h, &input_gather_h0);
+  F.Gather(input, &h1, axis_h, &input_gather_h1);
+
+  F.Mul(&input_gather_h0, &coef_h0, &input_gather_h0);
+  F.Mul(&input_gather_h1, &coef_h1, &input_gather_h1);
+  Tensor out_x4;
+  out_x4.mutable_data<T>({4, outdim[0], outdim[1], outdim[2], outdim[3]},
+                         place);
+  Tensor input_gather_h0_w0 = out_x4.Slice(0, 1);
+  Tensor input_gather_h0_w1 = out_x4.Slice(1, 2);
+  Tensor input_gather_h1_w0 = out_x4.Slice(2, 3);
+  Tensor input_gather_h1_w1 = out_x4.Slice(3, 4);
+  F.Gather(&input_gather_h0, &w0, axis_w, &input_gather_h0_w0);
+  F.Gather(&input_gather_h0, &w1, axis_w, &input_gather_h0_w1);
+  F.Gather(&input_gather_h1, &w0, axis_w, &input_gather_h1_w0);
+  F.Gather(&input_gather_h1, &w1, axis_w, &input_gather_h1_w1);
+  F.Mul(&input_gather_h0_w0, &coef_w0, &input_gather_h0_w0);
+  F.Mul(&input_gather_h0_w1, &coef_w1, &input_gather_h0_w1);
+  F.Mul(&input_gather_h1_w0, &coef_w0, &input_gather_h1_w0);
+  F.Mul(&input_gather_h1_w1, &coef_w1, &input_gather_h1_w1);
+  F.ReduceSum(&out_x4, output, std::vector<int>{0}, false);
+}
+
+template <typename T>
+void BilinearBwdNpu(const framework::ExecutionContext& ctx, const Tensor* gout,
+                    Tensor* gin, const float scale_h, const float scale_w,
+                    const bool align_corners, const int align_mode,
+                    const DataLayout& data_layout) {
+  InterpolateFunction<T> F(ctx);
+  auto place = ctx.GetPlace();
+  auto outdim = gout->dims();
+  auto indim = gin->dims();
+
+  int axis_h, axis_w;
+  int out_h, out_w, in_h, in_w;
+  float ratio_h, ratio_w;
+  InterpolateParamCompute(scale_h, scale_w, align_corners, align_mode,
+                          data_layout, indim, outdim, &axis_h, &axis_w, &in_h,
+                          &in_w, &out_h, &out_w, &ratio_h, &ratio_w);
+
+  Tensor h0, h1, w0, w1;
+  h0.mutable_data<int>({out_h}, place);
+  h1.mutable_data<int>({out_h}, place);
+  w0.mutable_data<int>({out_w}, place);
+  w1.mutable_data<int>({out_w}, place);
+  Tensor coef_h0, coef_h1, coef_w0, coef_w1;
+  coef_h0.mutable_data<T>({out_h}, place);
+  coef_h1.mutable_data<T>({out_h}, place);
+  coef_w0.mutable_data<T>({out_w}, place);
+  coef_w1.mutable_data<T>({out_w}, place);
+  bool align_cond = align_mode == 0 && !align_corners;
+  BilinearParamTensorCompute<T>(ctx, data_layout, in_h, in_w, out_h, out_w,
+                                align_cond, ratio_h, ratio_w, &h0, &h1, &w0,
+                                &w1, &coef_h0, &coef_h1, &coef_w0, &coef_w1);
+
+  Tensor gy_w0, gy_w1;
+  gy_w0.mutable_data<T>(outdim, place);
+  gy_w1.mutable_data<T>(outdim, place);
+  F.Mul(gout, &coef_w0, &gy_w0);
+  F.Mul(gout, &coef_w1, &gy_w1);
+
+  auto dim_gather_h = indim;
+  dim_gather_h[axis_h] = out_h;
+  Tensor g_gather_w0, g_gather_w1;
+  g_gather_w0.mutable_data<T>(dim_gather_h, place);
+  g_gather_w1.mutable_data<T>(dim_gather_h, place);
+  w0.Resize({out_w, 1});
+  w1.Resize({out_w, 1});
+  F.GatherGrad(&gy_w0, &w0, axis_w, &g_gather_w0);
+  F.GatherGrad(&gy_w1, &w1, axis_w, &g_gather_w1);
+
+  F.Add(&g_gather_w0, &g_gather_w1, &g_gather_w0);
+  F.Mul(&g_gather_w0, &coef_h1, &g_gather_w1);
+  F.Mul(&g_gather_w0, &coef_h0, &g_gather_w0);
+
+  Tensor gx_0, gx_1;
+  gx_0.mutable_data<T>(indim, place);
+  gx_1.mutable_data<T>(indim, place);
+  h0.Resize({out_h, 1});
+  h1.Resize({out_h, 1});
+  F.GatherGrad(&g_gather_w0, &h0, axis_h, &gx_0);
+  F.GatherGrad(&g_gather_w1, &h1, axis_h, &gx_1);
+
+  F.Add(&gx_0, &gx_1, gin);
+}
 
 template <typename DeviceContext, typename T>
 class InterpolateV2NPUKernel : public framework::OpKernel<T> {
@@ -39,19 +402,6 @@ class InterpolateV2NPUKernel : public framework::OpKernel<T> {
     int n, c, in_d, in_h, in_w;
     ExtractNCDWH(input_dims, data_layout, &n, &c, &in_d, &in_h, &in_w);
 
-    PADDLE_ENFORCE_EQ(
-        input->layout(), data_layout,
-        platform::errors::InvalidArgument(
-            "Interpolate OP's input tensor layout should equal to attr "
-            "data_layout, but got tensor layout <%s>, attr layout <%s>",
-            framework::DataLayoutToString(input->layout()), data_layout_str));
-    PADDLE_ENFORCE_EQ(
-        output->layout(), data_layout,
-        platform::errors::InvalidArgument(
-            "Interpolate OP's output tensor layout should equal to attr "
-            "data_layout, but got tensor layout <%s>, attr layout <%s>",
-            framework::DataLayoutToString(output->layout()), data_layout_str));
-
     auto interp_method = ctx.Attr<std::string>("interp_method");
     bool align_corners = ctx.Attr<bool>("align_corners");
 
@@ -156,17 +506,22 @@ class InterpolateV2NPUKernel : public framework::OpKernel<T> {
         ctx.template device_context<paddle::platform::NPUDeviceContext>()
             .stream();
 
-    NpuOpRunner runner;
     // To-do(qili93): need to support bilineare, try ResizeD
+    // Add bilineare by zhulei
     if ("nearest" == interp_method) {
+      NpuOpRunner runner;
       runner.SetType("ResizeNearestNeighborV2")
           .AddInput(*input)
           .AddInput(std::vector<int32_t>{out_h, out_w})
           .AddOutput(*output)
           .AddAttr("align_corners", align_corners)
           .AddAttr("half_pixel_centers", false);
+      runner.Run(stream);
+    } else if ("bilinear" == interp_method) {
+      int align_mode = ctx.Attr<int>("align_mode");
+      BilinearFwdNpu<T>(ctx, input, output, scale_h, scale_w, align_corners,
+                        align_mode, data_layout);
     }
-    runner.Run(stream);
   }
 };
 
@@ -184,27 +539,6 @@ class InterpolateV2NPUGradKernel : public framework::OpKernel<T> {
     int n, c, in_d, in_h, in_w;
     ExtractNCDWH(input->dims(), data_layout, &n, &c, &in_d, &in_h, &in_w);
 
-    PADDLE_ENFORCE_EQ(
-        input->layout(), data_layout,
-        platform::errors::InvalidArgument(
-            "Interpolate OP's input tensor layout should equal to attr "
-            "data_layout, but got tensor layout <%s>, attr layout <%s>",
-            framework::DataLayoutToString(input->layout()), data_layout_str));
-    PADDLE_ENFORCE_EQ(output_grad->layout(), data_layout,
-                      platform::errors::InvalidArgument(
-                          "Interpolate OP's output_grad tensor layout should "
-                          "equal to attr data_layout, but got tensor layout is "
-                          "<%s>, and attr layout is <%s>",
-                          framework::DataLayoutToString(output_grad->layout()),
-                          data_layout_str));
-    PADDLE_ENFORCE_EQ(input_grad->layout(), data_layout,
-                      platform::errors::InvalidArgument(
-                          "Interpolate OP's input_grad tensor layout should "
-                          "equal to attr data_layout, but got tensor layout is "
-                          "<%s>, and attr layout is <%s>",
-                          framework::DataLayoutToString(input_grad->layout()),
-                          data_layout_str));
-
     auto interp_method = ctx.Attr<std::string>("interp_method");
     bool align_corners = ctx.Attr<bool>("align_corners");
 
@@ -301,17 +635,21 @@ class InterpolateV2NPUGradKernel : public framework::OpKernel<T> {
         ctx.template device_context<paddle::platform::NPUDeviceContext>()
             .stream();
 
-    NpuOpRunner runner;
     // To-do(qili93): need to support bilineare, try ResizeGradD
     if ("nearest" == interp_method) {
+      NpuOpRunner runner;
       runner.SetType("ResizeNearestNeighborV2Grad")
           .AddInput(*output_grad)
           .AddInput(std::vector<int32_t>{in_h, in_w})
           .AddOutput(*input_grad)
           .AddAttr("align_corners", align_corners)
           .AddAttr("half_pixel_centers", false);
+      runner.Run(stream);
+    } else if ("bilinear" == interp_method) {
+      int align_mode = ctx.Attr<int>("align_mode");
+      BilinearBwdNpu<T>(ctx, output_grad, input_grad, scale_h, scale_w,
+                        align_corners, align_mode, data_layout);
     }
-    runner.Run(stream);
   }
 };
 
@@ -330,3 +668,13 @@ REGISTER_OP_NPU_KERNEL(
     nearest_interp_v2_grad,
     ops::InterpolateV2NPUGradKernel<plat::NPUDeviceContext, float>,
     ops::InterpolateV2NPUGradKernel<plat::NPUDeviceContext, plat::float16>);
+
+REGISTER_OP_NPU_KERNEL(
+    bilinear_interp_v2,
+    ops::InterpolateV2NPUKernel<plat::NPUDeviceContext, float>,
+    ops::InterpolateV2NPUKernel<plat::NPUDeviceContext, plat::float16>);
+
+REGISTER_OP_NPU_KERNEL(
+    bilinear_interp_v2_grad,
+    ops::InterpolateV2NPUGradKernel<plat::NPUDeviceContext, float>,
+    ops::InterpolateV2NPUGradKernel<plat::NPUDeviceContext, plat::float16>);
diff --git a/paddle/fluid/operators/kernel_primitives/compute_primitives_xpu2.h b/paddle/fluid/operators/kernel_primitives/compute_primitives_xpu2.h
new file mode 100644
index 0000000000000..3235591580916
--- /dev/null
+++ b/paddle/fluid/operators/kernel_primitives/compute_primitives_xpu2.h
@@ -0,0 +1,324 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include "xpu/kernel/cluster_header.h"
+#include "xpu/kernel/debug.h"
+#include "xpu/kernel/math.h"
+
+namespace paddle {
+namespace operators {
+namespace kernel_primitives {
+namespace details {
+
+// kGlobalMode: block reduce, each block gets an output;
+// kLocalMode: thread reduce, each thread gets an output;
+enum ReduceMode { kGlobalMode, kLocalMode };
+
+template <typename T>
+class MPTypeTrait {
+ public:
+  using Type = T;
+};
+
+template <>
+class MPTypeTrait<platform::float16> {
+ public:
+  using Type = float;
+};
+
+static inline __device__ void sync_all() {
+  __asm__ __volatile__(
+      "sync_local\t\n"
+      "csr_set csr3, %0\t\n"
+      "sync_group csr3" ::"r"(-1));
+}
+
+#define ncores 64
+template <typename T, typename OpFunc, int VecSize>
+__device__ void BlockXReduce(T* data, OpFunc reducer) {
+  __shared__ T sum_array[ncores * VecSize];
+  int core_idx = core_id() * VecSize;
+  mfence();
+  sync_all();
+
+#pragma unroll
+  for (int i = 0; i < VecSize; i++) {
+    mfence();
+    sum_array[core_idx + i] = data[i];
+    mfence();
+    data[i] = 0;
+  }
+  sync_all();
+#pragma unroll
+  for (int i = 0; i < VecSize; i++) {
+#pragma unroll
+    for (int j = 0; j < ncores; j++) {
+      mfence();
+      T tmp = sum_array[j * VecSize + i];
+      mfence();
+      data[i] = reducer(data[i], tmp);
+      mfence();
+    }
+  }
+  sync_all();
+}
+#undef ncores
+
+}  // namespace details
+
+/**
+ * @brief Perform unary calculation according to OpFunc. Shape of input and
+ * output are the same.
+ *
+ * @template paraments
+ * InT: The data type of in.
+ * OutT: The data type of out.
+ * NX: The number of data columns loaded by each thread.
+ * NY: The number of data rows loaded by each thread.
+ * BlockSize: Identifies the current device thread index method. For xpu,
+ * core_id() is used as the index.
+ * OpFunc: Compute functor which has an operator() as following:
+ *     template <typename InT, typename OutT>
+ *     struct XxxFunctor {
+ *       HOSTDEVICE OutT operator()(const InT& a) const {
+ *         return ...;
+ *       }
+ *     };
+ *
+ * @param：
+ * out: The register pointer of out, the size is NX * NY.
+ * in: The register pointer of in, the size is NX * NY.
+ * compute: Compute function which was declared like OpFunc<InT, OutT>().
+ */
+template <typename InT, typename OutT, int NX, int NY, int BlockSize,
+          class OpFunc>
+__device__ __forceinline__ void ElementwiseUnary(OutT* out, const InT* in,
+                                                 OpFunc compute) {
+#pragma unroll
+  for (int idx = 0; idx < NX * NY; idx++) {
+    out[idx] = static_cast<OutT>(compute(in[idx]));
+  }
+}
+
+/**
+ * @brief Binary calculation according to OpFunc. Shape of The input and output
+ * are the same.
+ *
+ * @template paraments
+ * InT: The data type of in1 and in2.
+ * OutT: The data type of out.
+ * NX: The number of data columns computed by each thread.
+ * NY: The number of data rows computed by each thread.
+ * BlockSize: Identifies the current device thread index method. For xpu,
+ * core_id() is used as the index.
+ * OpFunc: Compute functor which has an operator() as following:
+ *     template <typename InT>
+ *     struct XxxFunctor {
+ *       HOSTDEVICE InT operator()(const InT& a, const InT& b) const {
+ *         return ...;
+ *       }
+ *     };
+ *
+ * @param：
+ * out: The register pointer of out, the size is NX * NY.
+ * in1: The register pointer of fist input, size is NX * NY.
+ * in2: The register pointer of second input, size is NX * NY.
+ * compute: Compute function which was declared like OpFunc<InT>().
+ */
+template <typename InT, typename OutT, int NX, int NY, int BlockSize,
+          class OpFunc>
+__device__ __forceinline__ void ElementwiseBinary(OutT* out, const InT* in1,
+                                                  const InT* in2,
+                                                  OpFunc compute) {
+#pragma unroll
+  for (int idx = 0; idx < NX * NY; ++idx) {
+    out[idx] = static_cast<OutT>(compute(in1[idx], in2[idx]));
+  }
+}
+
+/**
+ * @brief Ternary calculation according to OpFunc. Shape of input and output
+ * are the same.
+ *
+ * @template paraments
+ * InT: The data type of in1 and in2.
+ * OutT: The data type of out.
+ * NX: The number of data columns loaded by each thread.
+ * NY: The number of data rows loaded by each thread.
+ * BlockSize: Identifies the current device thread index method. For xpu,
+ * core_id() is used as the index.
+ * OpFunc: Compute functor which has an operator() as following
+ *     template <typename InT>
+ *     struct XxxFunctor {
+ *       HOSTDEVICE InT operator()(const InT& a, const InT& b, const InT& c)
+ * const {
+ *         return ...;
+ *       }
+ *     };
+ *
+ * @param
+ * out: The register pointer of out, the size is NX * NY.
+ * in1: The register pointer of fist input, size is NX * NY.
+ * in2: The register pointer of second input, size is NX * NY.
+ * in3: The register pointer of third input, size is NX * NY.
+ * compute: Compute function which was declared like OpFunc<InT>().
+ */
+template <typename InT, typename OutT, int NX, int NY, int BlockSize,
+          class OpFunc>
+__device__ __forceinline__ void ElementwiseTernary(OutT* out, const InT* in1,
+                                                   const InT* in2,
+                                                   const InT* in3,
+                                                   OpFunc compute) {
+#pragma unroll
+  for (int idx = 0; idx < NX * NY; ++idx) {
+    out[idx] = static_cast<OutT>(compute(in1[idx], in2[idx], in3[idx]));
+  }
+}
+
+/**
+ * @brief Multivariate calculation according to OpFunc. Shape of inputs and
+ * output are the same.
+ *
+ * @template paraments
+ * InT: The data type of in1, in2 and in3.
+ * OutT: The data type of out.
+ * NX: The number of data columns loaded by each thread.
+ * NY: The number of data rows loaded by each thread.
+ * BlockSize: Identifies the current device thread index method. For xpu,
+ * core_id() is used as the index.
+ * Arity: The size of ins
+ * OpFunc: Compute functor which has an operator() as following:
+ *     template <typename InT>
+ *     struct XxxFunctor {
+ *       HOSTDEVICE InT operator()(const InT* args) const {
+ *         return ...;
+ *       }
+ *     };
+ *
+ * @param
+ * out: The register pointer of out, the size is NX * NY.
+ * ins: A pointers of array consisting of multiple inputs.
+ * compute: Compute function which was declared like OpFunc<InT>().
+ */
+template <typename InT, typename OutT, int NX, int NY, int BlockSize, int Arity,
+          class OpFunc>
+__device__ __forceinline__ void ElementwiseAny(OutT* out, InT (*ins)[NX * NY],
+                                               OpFunc compute) {
+  __local__ InT args[Arity];
+#pragma unroll
+  for (int idx = 0; idx < NX * NY; ++idx) {
+#pragma unroll
+    for (int j = 0; j < Arity; ++j) {
+      args[j] = ins[j][idx];
+    }
+    out[idx] = static_cast<OutT>(compute(args));
+  }
+}
+
+/**
+ * @brief Binary calculation according to OpFunc. The shape of in1 and in2 are
+ * different. When in1's shape is [1, NX], in2's shape is [NY, NX], then
+ * output's shape is [NY, NX].
+ *
+ * @template paraments
+ * InT: The data type of in1 and in2.
+ * OutT: The data type of out.
+ * NX: The number of data columns loaded by each thread.
+ * NY: The number of data rows loaded by each thread.
+ * BlockSize: Identifies the current device thread index method. For xpu,
+ * core_id() is used as the index.
+ * OpFunc: Compute functor which has an operator() as following
+ *     template <typename InT, typename OutT>
+ *     struct XxxFunctor {
+ *       HOSTDEVICE OutT operator()(const InT& a, const InT& b) const {
+ *         return ...;
+ *       }
+ *     };
+ *
+ * @param
+ * out: The register pointer of out, the size is NX * NY.
+ * in1: The register pointer of fist input, size is NX * 1.
+ * in2: The register pointer of second input, size is NX * NY.
+ * compute: Compute function which was declared like OpFunc<InT, OutT>().
+ */
+template <typename InT, typename OutT, int NX, int NY, int BlockSize,
+          class OpFunc>
+__device__ __forceinline__ void CycleBinary(OutT* out, const InT* in1,
+                                            const InT* in2, OpFunc compute) {
+#pragma unroll
+  for (int idx = 0; idx < NX; idx++) {
+#pragma unroll
+    for (int idy = 0; idy < NY; idy++) {
+      out[idx + idy * NX] =
+          static_cast<OutT>(compute(in1[idx], in2[idx + idy * NX]));
+    }
+  }
+}
+
+/**
+ * @brief The Reduce provides collective methods for computing a parallel
+ * reduction of items partitioned across a CUDA block and intra thread. When
+ * ReduceMode == kLocalMode, thread reduce along nx. When ReduceMode ==
+ * kGlobalMode, use shared memory to reduce between threads.
+ *
+ * @template paraments
+ * T: The type of data.
+ * NX: The number of data continuously loaded by each thread.
+ * NY: The number of data rows loaded by each thread, only NY = 1 was supported.
+ * BlockSize: Identifies the current device thread index method. For xpu,
+ * core_id() is used as the index.
+ * ReduceFunctor: Compute functor which has an operator() as following
+ *     template <typename InT>
+ *     struct ReduceFunctor {
+ *       HOSTDEVICE InT operator()(const InT& a, const InT& b) const {
+ *         return ...;
+ *       }
+ *     };
+ * ReduceMode: Reduce mode, can be kLocalMode, kGlobalMode.
+ *
+ * @param
+ * out: The register pointer of out, the size is NX * NY.
+ * in: The register pointer of in, the size is NX * NY.
+ * reducer: Compute function which was declared like ReduceFunctor<InT>().
+ * reduce_last_dim: if the last dim gets involved in reduction.
+ */
+template <typename T, int NX, int NY, int BlockSize, class ReduceFunctor,
+          details::ReduceMode Mode>
+__device__ __forceinline__ void Reduce(T* out, const T* in,
+                                       ReduceFunctor reducer,
+                                       bool reduce_last_dim) {
+  if (Mode == kGlobalMode) {
+#pragma unroll
+    for (int i = 0; i < NY; ++i) {
+#pragma unroll
+      for (int j = 0; j < NX; ++j) {
+        out[i] = reducer(out[i], in[i * NX + j]);
+      }
+    }
+    BlockXReduce<T, OpFunc, NY>(out, reducer);
+  } else {  // else  kLocalMode
+#pragma unroll
+    for (int i = 0; i < NY; ++i) {
+#pragma unroll
+      for (int j = 0; j < NX; ++j) {
+        out[i] = reducer(out[i], in[i * NX + j]);
+      }
+    }
+  }
+}
+
+}  // namespace kernel_primitives
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/kernel_primitives/datamover_primitives_xpu2.h b/paddle/fluid/operators/kernel_primitives/datamover_primitives_xpu2.h
new file mode 100644
index 0000000000000..b27ba27b3c6f1
--- /dev/null
+++ b/paddle/fluid/operators/kernel_primitives/datamover_primitives_xpu2.h
@@ -0,0 +1,567 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include "xpu/kernel/cluster_header.h"
+#include "xpu/kernel/debug.h"
+#include "xpu/kernel/math.h"
+
+namespace paddle {
+namespace operators {
+namespace kernel_primitives {
+namespace details {
+
+template <typename T, int VecSize>
+struct alignas(sizeof(T) * VecSize) VectorType {
+  T val[VecSize];
+};
+
+/**
+ * Configuration of broadcast. Calculate the input data index according to the
+ * index of the output data. if input or output shape is [dim0, dim1] then dims
+ * must be [dim1, dim0].
+ */
+template <int kDims>
+struct BroadcastConfig {
+  uint32_t stride_in[framework::DDim::kMaxRank];
+  uint32_t stride_out[framework::DDim::kMaxRank];
+  uint32_t shape_in[framework::DDim::kMaxRank];
+
+  HOSTDEVICE BroadcastConfig() {}
+
+  HOSTDEVICE BroadcastConfig(const std::vector<int64_t>& out_dims,
+                             const std::vector<int64_t>& in_dims,
+                             int dim_size) {
+    std::vector<uint32_t> strides_in;
+    std::vector<uint32_t> strides_out;
+    std::vector<uint32_t> shapes_in;
+
+    strides_out.resize(dim_size, 1);
+    strides_in.resize(dim_size, 1);
+    shapes_in.resize(dim_size, 1);
+
+    for (int i = 0; i < dim_size; ++i) {
+      shape_in[i] = in_dims[dim_size - i - 1];
+    }
+
+    for (int i = 1; i < dim_size - 1; ++i) {
+      strides_out[dim_size - i - 1] = std::accumulate(
+          out_dims.begin(), out_dims.begin() + i, 1, std::multiplies<int64_t>())
+          strides_in[dim_size - i - 1] =
+              std::accumulate(in_dims.begin(), in_dims.begin() + i, 1,
+                              std::multiplies<int64_t>())
+    }
+
+    memcpy(stride_in, strides_in.data(), kDims * sizeof(uint32_t));
+    memcpy(stride_out, strides_out.data(), kDims * sizeof(uint32_t));
+    memcpy(shape_in, shapes_in.data(), kDims * sizeof(uint32_t));
+  }
+};
+
+}  // namespace details
+
+/**
+ * @brief Read 2D data from global memory to register according to Tx type, and
+ * store it as Ty type into register.
+ *
+ * @template paraments
+ * Tx: The type of data stored in the global memory.
+ * Ty: The type of data that needs to be stored in registers.
+ * NX: The number of data columns loaded by each thread.
+ * NY: The number of data rows loaded by each thread.
+ * BlockSize: Identifies the current device thread index method. For xpu,
+ * core_id() is used as the index.
+ * IsBoundary: Indicates whether to perform block access storage out-of-bounds
+ * judgment. When the number of data processed by the block is less than
+ * NX x NY x core_num(), boundary judgment is required to avoid memory access
+ * crossing the boundary.
+ *
+ * @param：
+ * dst: The register pointer of the thread, the size is NX * NY.
+ * src: The data pointer of the current block.
+ * size_nx: The maximum offset of the current block is size_nx elements in the
+ * lowest dimension. The parameters are only calculated when isboundary = true.
+ * size_ny: The maximum offset of the current block is size_ny elements in the
+ * first dimension. The parameters are only calculated when isboundary = true.
+ * stride_nx: Each read one element stride stride_nx elements in the last dim.
+ * stride_ny: Each read one element stride stride_ny elements in the first dim.
+ */
+template <typename Tx, typename Ty, int NX, int NY, int BlockSize,
+          bool IsBoundary = false>
+__device__ __forceinline__ void ReadData(Ty* dst, const Tx _global_ptr_* src,
+                                         int size_nx, int size_ny,
+                                         int stride_nx, int stride_ny) {
+  int thread_offset = core_id();
+  int left_size_nx = size_nx - thread_offset;
+  __local__ T in_temp[1];
+  // Each branch is added for better performance
+  if (NX == 1 && NY == 1) {  // for NX == 1 and NY == 1
+    if (IsBoundary) {
+      if (left_size_nx > 0) {
+        GM2LM(src + thread_offset, in_temp, sizeof(Tx));
+        dst[0] = static_cast<Ty>(in_temp[0]);
+      }
+    } else {
+      GM2LM(src + thread_offset, in_temp, sizeof(Tx));
+      dst[0] = static_cast<Ty>(in_temp[0]);
+    }
+  } else if (NX == 1) {  // for NX == 1 and NY != 1
+#pragma unroll
+    for (int idy = 0; idy < NY; ++idy) {
+      if (IsBoundary) {
+        if (idy * stride_ny >= size_ny) {
+          break;
+        }
+      }
+      GM2LM(src + thread_offset + idy * stride_ny, in_temp, sizeof(Tx));
+      dst[idy] = static_cast<Ty>(in_temp[0]);
+    }
+  } else if (NY == 1) {  // for NY == 1 and NX != 1
+#pragma unroll
+    for (int idx = 0; idx < NX; ++idx) {
+      if (IsBoundary) {
+        if (idx * stride_nx >= left_size_nx) {
+          break;
+        }
+      }
+      GM2LM(src + thread_offset + idx * stride_nx, in_temp, sizeof(Tx));
+      dst[idx] = static_cast<Ty>(in_temp[0]);
+    }
+  } else {  // for NX != 1 and NY != 1
+#pragma unroll
+    for (int idx = 0; idx < NX; ++idx) {
+#pragma unroll
+      for (int idy = 0; idy < NY; ++idy) {
+        if (IsBoundary) {
+          if (idy * stride_ny >= size_ny || idx * stride_nx >= left_size_nx) {
+            break;
+          }
+        }
+        int fix = thread_offset + idx * stride_nx + idy * stride_ny;
+        GM2LM(src + fix, in_temp, sizeof(Tx));
+        dst[idy * NX + idx] = static_cast<Ty>(in_temp[0]);
+      }
+    }
+  }
+}
+
+/**
+ * @brief Initialize register with init_data.
+ *
+ * @template paraments
+ * T: Data type of register.
+ * NX: Number of data to initialize.
+ *
+ * @param：
+ * dst: The register pointer of the thread, the size is NX.
+ * init_data: Initial value.
+ */
+template <typename T, int NX>
+__device__ __forceinline__ void Init(T* dst, T init_data) {
+#pragma unroll
+  for (int i = 0; i < NX; i++) {
+    dst[i] = init_data;
+  }
+}
+
+/**
+ * @brief Read 1D data from global memory to register. When IsBoundary = true
+ * and (NX % 4 == 0 or Nx % 2 == 0), vectorized load data will be used to
+ * improve memory access efficiency.
+ *
+ * @template paraments
+ * T: The type of data.
+ * NX: Each thread load NX data from global memory continuously.
+ * NY: Each thread need to load NY rows, only NY = 1 was supported.
+ * BlockSize: Identifies the current device thread index method. For xpu,
+ * core_id() is used as the index.
+ * IsBoundary: Whether to make an out-of-bounds judgment on access to memory.
+ * When the number of data processed by this block is less than
+ * NX x NY x core_num(), boundary judgment is required to avoid memory access
+ * crossing the boundary.
+ *
+ * @param：
+ * dst: The register pointer of the thread, the size is NX * NY.
+ * src: The data pointer of the current block.
+ * size: The current block needs to load size data continuously.
+ */
+template <typename T, int NX, int NY, int BlockSize, bool IsBoundary = false>
+__device__ __forceinline__ void ReadData(T* dst, const T _global_ptr_* src,
+                                         int num) {
+  int thread_offset = core_id() * NX;
+  __local__ T in_temp[1];
+  if (IsBoundary) {  // core_num() * NX > num
+#pragma unroll
+    for (int idx = 0; idx < NX; ++idx) {
+      if (idx + thread_offset < num) {
+        GM2LM(src + thread_offset + idx, in_temp, sizeof(T));
+        dst[idx] = in_temp[0];
+      }
+    }
+  } else {  // core_num() * NX < num
+    GM2LM(src + thread_offset, dst, NX * sizeof(T));
+  }
+}
+
+/**
+ * @brief Read 2D data from global memory to registers with broadcast form.
+ *
+ * @template paraments
+ * T: The type of data stored in the global memory.
+ * NX: The number of data columns loaded by each thread.
+ * NY: The number of data rows loaded by each thread.
+ * BlockSize: Identifies the current device thread index method. For xpu,
+ * core_id() is used as the index.
+ * Rank: The shape size of out. eg in[1, 35], out[32, 35] then shape size is 2.
+ * IsBoundary: Indicates whether to perform block access storage out-of-bounds
+ * judgment. When the number of data processed by the block is less than
+ * NX x NY x core_num(), boundary judgment is required to avoid memory access
+ * crossing the boundary.
+ *
+ * @param：
+ * dst: The register pointer of the thread, the size is NX * NY.
+ * src: Raw input data pointer of kernel.
+ * block_offset: Data offset of this block, core_num() *  cluster_id() * NX;
+ * config: Calculation configuration of broadcast. It is used to calculate the
+ * coordinate mapping relationship between output data and input data.
+ * total_num_output: Total number of original output.
+ * stride_nx: Each read one element stride stride_nx elements in the last dim.
+ * stride_ny: Each read one element stride stride_ny elements in the first dim.
+ */
+template <typename T, int NX, int NY, int BlockSize, int Rank,
+          bool IsBoundary = false>
+__device__ __forceinline__ void ReadDataBc(
+    T* dst, const T _global_ptr_* src, uint32_t block_offset,
+    details::BroadcastConfig<Rank> config, int total_num_output, int stride_nx,
+    int stride_ny) {
+  uint32_t thread_offset = block_offset + core_id();
+  uint32_t index_src = 0;
+  __local__ T in_temp[1];
+
+#pragma unroll
+  for (int ny = 0; ny < NY; ++ny) {
+#pragma unroll
+    for (uint32_t nx = 0; nx < NX; ++nx) {
+      uint32_t index_output = thread_offset + ny * stride_ny + nx * stride_nx;
+      index_src = 0;
+      if (IsBoundary) {
+        if (index_output >= total_num_output) {
+          break;
+        }
+      }
+#pragma unroll
+      for (int i = 0; i < Rank; ++i) {
+        uint32_t tmp = index_output / config.stride_out[i];
+        index_output = index_output - tmp * config.stride_out[i];
+        index_src += (tmp % config.shape_in[i]) * config.stride_in[i];
+      }
+      GM2LM(src + index_src, in_temp, sizeof(T));
+      dst[nx + ny * NX] = in_temp[0];
+    }
+  }
+}
+
+/**
+ * @brief Read 2D data from global memory to register with reduce form.
+ *
+ * @template paraments
+ * T: The type of data.
+ * NX: The number of data columns loaded by each thread.
+ * NY: The number of data rows loaded by each thread.
+ * BlockSize: Identifies the current device thread index method. For xpu,
+ * core_id() is used as the index.
+ * Rank: The shape size of out. eg in[1, 35], out[32, 35] then shape size is 2.
+ * IsBoundary: Indicates whether to perform block access storage out-of-bounds
+ * judgment. When the number of data processed by the block is less than
+ * NX x NY x core_num(), boundary judgment is required to avoid memory access
+ * crossing the boundary.
+ *
+ * @param：
+ * dst: The register pointer of the thread, the size is NX * NY.
+ * src: The input data pointer of this block.
+ * block_offset: The data offset of this block, blockDim.x * cluster_id() * NX.
+ * index_cal: Calculation configuration of Reduce. It is used to calculate the
+ * coordinate mapping relationship between output data and input data.
+ * size_nx: The current block needs to load size_nx columns of data, this
+ * parameter will participate in the calculation when isboundary = true.
+ * size_ny: The current block needs to load size_ny rows of data, this parameter
+ * will participate in the calculation when isboundary = true.
+ * will be used when IsBoundary = true.
+ * stride_nx: Each read one element stride stride_nx columns.
+ * stride_ny: Each read one element stride stride_ny raws.
+ * reduce_last_dim: Used to indicate whether the dimension of reduce contains
+ * the lowest dimension.
+ */
+template <typename T, int NX, int NY, int BlockSize, int Rank,
+          typename IndexCal, bool IsBoundary = false>
+__device__ __forceinline__ void ReadDataReduce(
+    T* dst, const T _global_ptr_* src, int block_offset,
+    const IndexCal& index_cal, int size_nx, int size_ny, int stride_nx,
+    int stride_ny, bool reduce_last_dim) {
+  __local__ T in_temp[1];
+  int thread_offset = 0;
+  int left_size_nx = size_nx;
+  int left_size_ny = size_ny;
+  if (reduce_last_dim) {
+    thread_offset = block_offset + core_id();
+    left_size_nx -= thread_offset;
+  } else {
+    thread_offset = block_offset + core_id();
+    left_size_ny -= thread_offset;
+  }
+
+  if (NX == 1) {
+#pragma unroll
+    for (int ny = 0; ny < NY; ++ny) {
+      if (IsBoundary) {
+        if (ny * stride_ny >= left_size_ny) {
+          break;
+        }
+      }
+      uint32_t index_src = index_cal(thread_offset);
+      GM2LM(src + index_src, in_temp, sizeof(T));
+      dst[ny] = in_temp[0];
+      thread_offset += stride_ny;
+    }
+  } else {
+#pragma unroll
+    for (int nx = 0; nx < NX; ++nx) {
+#pragma unroll
+      for (int ny = 0; ny < NY; ++ny) {
+        if (IsBoundary) {
+          if ((ny * stride_ny >= left_size_ny) ||
+              (nx * stride_nx >= left_size_nx)) {
+            break;
+          }
+        }
+        uint32_t index_src = index_cal(thread_offset);
+        GM2LM(src + index_src, in_temp, sizeof(T));
+        dst[nx + ny * NX] = in_temp[0];
+        thread_offset += stride_ny;
+      }
+      thread_offset += stride_nx;
+    }
+  }
+}
+/**
+ * @brief Write 1D data from registers to global memory. When IsBoundary = true
+ * and (NX % 4 == 0 or Nx % 2 == 0), the data will be vectorized to improve the
+ * data loading efficiency
+ *
+ * @template paraments
+ * T: The type of data.
+ * NX: The number of data continuously writed by each thread.
+ * NY: The number of data rows loaded by each thread, only NY = 1 was supported.
+ * BlockSize: Identifies the current device thread index method. For xpu,
+ * core_id() is used as the index.
+ * IsBoundary: Indicates whether to perform block access storage out-of-bounds
+ * judgment. When the number of data processed by the block is less than
+ * NX x NY x core_num(), boundary judgment is required to avoid memory access
+ * crossing the boundary.
+ *
+ * @param：
+ * dst: The data pointer of the current block.
+ * src: The register pointer, the size is NX * NY.
+ * size: The current block needs to load size elements continuously.
+ */
+
+template <typename T, int NX, int NY, int BlockSize, bool IsBoundary>
+__device__ void WriteData(T _global_ptr_* dst, const T* src, int num) {
+  int thread_offset = core_id() * NX;
+  __local__ T in_temp[1];
+  if (IsBoundary) {  // core_num() * NX > num
+#pragma unroll
+    for (int idx = 0; idx < NX; ++idx) {
+      if (idx + thread_offset < num) {
+        in_temp[0] = src[idx];
+        LM2GM(in_temp, dst + idx + thread_offset, sizeof(T));
+      }
+    }
+  } else {  // core_num() * NX < num
+    LM2GM(src, dst + thread_offset, NX * sizeof(T));
+  }
+}
+
+/**
+ * @brief Write 2D data from register to global memory according to Tx type, and
+ * store it as Ty type.
+ *
+ * @template paraments
+ * Tx: The type of data that needs to be stored in registers.
+ * Ty: The type of data stored in the global memory.
+ * NX: The number of data columns loaded by each thread.
+ * NY: The number of data rows loaded by each thread.
+ * BlockSize: Identifies the current device thread index method. For xpu,
+ * core_id() is used as the index.
+ * IsBoundary: Indicates whether to perform block access storage out-of-bounds
+ * judgment. When the number of data processed by the block is less than
+ * NX x NY x core_num(), boundary judgment is required to avoid memory access
+ * crossing the boundary.
+ *
+ * @param：
+ * dst: Data pointer of the current block.
+ * src: The register pointer of the thread, the size is NX * NY.
+ * size_nx: The current block needs to load size_nx columns of data, this
+ * parameter will be used when IsBoundary = true.
+ * size_ny: The current block needs to load size_ny rows of data. This parameter
+ * will be used when IsBoundary = true.
+ * stride_nx: Each read one element stride stride_nx elements in the last dim.
+ * stride_ny: Each read one element stride stride_ny elements in the first dim.
+ */
+template <typename Tx, typename Ty, int NX, int NY, int BlockSize,
+          bool IsBoundary = false>
+__device__ __forceinline__ void WriteData(Ty _global_ptr_* dst, const Tx* src,
+                                          int size_nx, int size_ny,
+                                          int stride_nx, int stride_ny) {
+  int thread_offset = core_id();
+  int left_size_nx = size_nx - thread_offset;
+  __local__ Ty in_temp[1];
+
+  // Each branch is added for better performance
+  if (NX == 1 && NY == 1) {
+    if (IsBoundary) {
+      if (left_size_nx > 0) {
+        in_temp[0] = static_cast<Ty>(src[0]);
+        LM2GM(in_temp, dst + thread_offset, sizeof(T));
+      }
+    } else {
+      in_temp[0] = static_cast<Ty>(src[0]);
+      LM2GM(in_temp, dst + thread_offset, sizeof(T));
+    }
+  } else if (NX == 1) {
+#pragma unroll
+    for (int idy = 0; idy < NY; ++idy) {
+      if (IsBoundary) {
+        if (idy * stride_ny >= size_ny) {
+          break;
+        }
+      }
+
+      in_temp[0] = static_cast<Ty>(src[idy]);
+      LM2GM(in_temp, dst + thread_offset + idy * stride_ny, sizeof(T));
+    }
+  } else if (NY == 1) {  // for NY == 1 and NX != 1
+#pragma unroll
+    for (int idx = 0; idx < NX; ++idx) {
+      if (IsBoundary) {
+        if (idx * stride_nx >= left_size_nx) {
+          break;
+        }
+      }
+
+      in_temp[0] = static_cast<Ty>(src[idx]);
+      LM2GM(in_temp, dst + thread_offset + idx * stride_nx, sizeof(T));
+    }
+  } else {  // for NX != 1 and NY != 1
+#pragma unroll
+    for (int idx = 0; idx < NX; ++idx) {
+      if (IsBoundary) {
+        if (idx * stride_nx >= left_size_nx) {
+          break;
+        }
+      }
+#pragma unroll
+      for (int idy = 0; idy < NY; ++idy) {
+        if (IsBoundary) {
+          if (idy * stride_ny >= size_ny) {
+            break;
+          }
+        }
+        in_temp[0] = static_cast<Ty>(src[idx + idy * NX]);
+        LM2GM(in_temp, dst + thread_offset + idx * stride_nx + idy * stride_ny,
+              sizeof(T));
+      }
+    }
+  }
+}
+
+/**
+ * @brief Initialize register with init_data.
+ *
+ * @template paraments
+ * T: Data type of register.
+ * NX: Number of data to initialize.
+ *
+ * @param：
+ * dst: The register pointer of the thread, the size is NX.
+ * init_data: The register pointer of init data, the size is NX.
+ */
+template <typename T, int NX, bool IsBoundary = false>
+__device__ __forceinline__ void Init(T* dst, T* init_data, int num) {
+#pragma unroll
+  for (int i = 0; i < NX; i++) {
+    if (IsBoundary) {
+      if (i >= num) {
+        break;
+      }
+    }
+    dst[i] = init_data[i];
+  }
+}
+
+/**
+ * @brief Read 1D data from global memory to register with broadcast form.
+ *
+ * @template paraments
+ * T: The type of data stored in the global memory.
+ * NX: The number of data continuously loaded by each thread.
+ * NY: The number of data rows loaded by each thread, only NY = 1 was supported.
+ * BlockSize: Identifies the current device thread index method. For xpu,
+ * core_id() is used as the index.
+ * Rank: The shape size of out. eg in[1, 35], out[32, 35] then shape size is 2.
+ * IsBoundary: Indicates whether to perform block access storage out-of-bounds
+ * judgment. When the number of data processed by the block is less than
+ * NX x NY x core_num(), boundary judgment is required to avoid memory access
+ * crossing the boundary.
+ *
+ * @param：
+ * dst: The register pointer of the thread, the size is NX * NY.
+ * src: The original input data pointer of kernel.
+ * block_offset: The data offset of this block, core_num() * blockIdx.x * NX;
+ * config: Calculation configuration of broadcast. It is used to calculate the
+ * coordinate mapping relationship between output data and input data.
+ * total_num_output: Total number of original output.
+ */
+template <typename T, int NX, int NY, int BlockSize, int Rank,
+          bool IsBoundary = false>
+__device__ __forceinline__ void ReadDataBc(
+    T* dst, const T _global_ptr_* src, uint32_t block_offset,
+    details::BroadcastConfig<Rank> config, int total_num_output) {
+  uint32_t thread_offset = block_offset + core_id() * NX;
+  uint32_t index_src = 0;
+  __local__ T in_temp[1];
+
+#pragma unroll
+  for (uint32_t nx = 0; nx < NX; ++nx) {
+    uint32_t index_output = thread_offset + nx;
+    index_src = 0;
+    if (IsBoundary) {
+      if (index_output >= total_num_output) {
+        break;
+      }
+    }
+#pragma unroll
+    for (int i = 0; i < Rank; ++i) {
+      uint32_t tmp = index_output / config.stride_out[i];
+      index_output = index_output - tmp * config.stride_out[i];
+      index_src += (tmp % config.shape_in[i]) * config.stride_in[i];
+    }
+    GM2LM(src + index_src, in_temp, sizeof(T));
+    dst[nx + ny * NX] = in_temp[0];
+  }
+}
+
+}  // namespace kernel_primitives
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/lookup_table_v2_op_npu.cc b/paddle/fluid/operators/lookup_table_v2_op_npu.cc
index 3cb91c712335d..a6fd7e5c7a97d 100644
--- a/paddle/fluid/operators/lookup_table_v2_op_npu.cc
+++ b/paddle/fluid/operators/lookup_table_v2_op_npu.cc
@@ -101,6 +101,7 @@ class LookupTableV2GradNPUKernel : public framework::OpKernel<T> {
     auto stream =
         ctx.template device_context<paddle::platform::NPUDeviceContext>()
             .stream();
+    int64_t padding_idx = ctx.Attr<int64_t>("padding_idx");
 
     /* EmbeddingDenseGrad has bug on large shape, temporarily disable it.
 
@@ -123,13 +124,34 @@ class LookupTableV2GradNPUKernel : public framework::OpKernel<T> {
         NpuOpRunner("ZerosLike", {*table_grad_t}, {*table_grad_t});
     runner_zeros.Run(stream);
 
-    // NOTE(zhiqiu): It seems in cann 20.1, the first input and output
-    // can be different tensor, but in cann 20.2+, it does inplace operation.
-    // Thus, the first input and output should be same tensor.
-    const auto &runner_scatter =
-        NpuOpRunner("ScatterAdd", {*table_grad_t, *ids_t, *output_grad_t},
-                    {*table_grad_t}, {{"use_locking", true}});
-    runner_scatter.Run(stream);
+    if (padding_idx == kNoPadding) {
+      // NOTE(zhiqiu): It seems in cann 20.1, the first input and output
+      // can be different tensor, but in cann 20.2+, it does inplace operation.
+      // Thus, the first input and output should be same tensor.
+      const auto &runner_scatter =
+          NpuOpRunner("ScatterAdd", {*table_grad_t, *ids_t, *output_grad_t},
+                      {*table_grad_t}, {{"use_locking", true}});
+      runner_scatter.Run(stream);
+    } else {
+      Tensor casted_ids_t;
+      if (ids_t->type() != framework::proto::VarType::INT32) {
+        casted_ids_t.mutable_data<int32_t>(ids_t->dims(), ctx.GetPlace());
+        const auto &cast_runner = NpuOpRunner("Cast", {*ids_t}, {casted_ids_t},
+                                              {{"dst_type", ACL_INT32}});
+        cast_runner.Run(stream);
+      } else {
+        casted_ids_t.ShareDataWith(*ids_t);
+      }
+      auto table_grad_dims = table_grad_t->dims();
+
+      NpuOpRunner runner;
+      runner.SetType("UnsortedSegmentSum")
+          .AddInput(*output_grad_t)
+          .AddInput(casted_ids_t)
+          .AddInput(std::vector<int64_t>{table_grad_dims[0]})
+          .AddOutput(*table_grad_t);
+      runner.Run(stream);
+    }
   }
 };
 }  // namespace operators
diff --git a/paddle/fluid/operators/math/blas.h b/paddle/fluid/operators/math/blas.h
index 6546f854df0f4..f245bad01aa4c 100644
--- a/paddle/fluid/operators/math/blas.h
+++ b/paddle/fluid/operators/math/blas.h
@@ -253,6 +253,12 @@ class Blas {
   void BatchedGETRS(CBLAS_TRANSPOSE trans, int n, int nrhs, const T** a,
                     int lda, int* ipiv, T** b, int ldb, int* info,
                     int batch_size) const;
+
+  // cuBlas triangular_solve
+  template <typename T>
+  void BatchedTRSM(CBLAS_SIDE side, CBLAS_UPLO uplo, CBLAS_TRANSPOSE transA,
+                   CBLAS_DIAG diag, int M, int N, T alpha, const T** a, int lda,
+                   T** b, int ldb, int batch_size) const;
 #endif
 
  private:
@@ -414,6 +420,12 @@ class BlasT : private Blas<DeviceContext> {
   void BatchedGETRS(ARGS... args) const {
     Base()->template BatchedGETRS<T>(args...);
   }
+
+  // triangular_solve
+  template <typename... ARGS>
+  void BatchedTRSM(ARGS... args) const {
+    Base()->template BatchedTRSM<T>(args...);
+  }
 #endif
 
  private:
diff --git a/paddle/fluid/operators/math/blas_impl.cu.h b/paddle/fluid/operators/math/blas_impl.cu.h
index 6f83faf1e40d8..70c6cf9dcab03 100644
--- a/paddle/fluid/operators/math/blas_impl.cu.h
+++ b/paddle/fluid/operators/math/blas_impl.cu.h
@@ -120,6 +120,11 @@ struct CUBlas<float> {
     PADDLE_ENFORCE_CUDA_SUCCESS(
         platform::dynload::cublasSgetrsBatched(args...));
   }
+
+  template <typename... ARGS>
+  static void TRSM_BATCH(ARGS... args) {
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasStrsmBatched(args...));
+  }
 };
 
 template <>
@@ -194,6 +199,11 @@ struct CUBlas<double> {
     PADDLE_ENFORCE_CUDA_SUCCESS(
         platform::dynload::cublasDgetrsBatched(args...));
   }
+
+  template <typename... ARGS>
+  static void TRSM_BATCH(ARGS... args) {
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasDtrsmBatched(args...));
+  }
 };
 
 template <>
@@ -339,6 +349,19 @@ struct CUBlas<platform::complex<float>> {
         reinterpret_cast<cuFloatComplex *>(C), ldc));
   }
 
+  static void TRSM(cublasHandle_t handle, cublasSideMode_t side,
+                   cublasFillMode_t uplo, cublasOperation_t transa,
+                   cublasDiagType_t diag, int m, int n,
+                   const paddle::platform::complex<float> *alpha,
+                   const paddle::platform::complex<float> *A, int lda,
+                   paddle::platform::complex<float> *B, int ldb) {
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasCtrsm(
+        handle, side, uplo, transa, diag, m, n,
+        reinterpret_cast<const cuFloatComplex *>(alpha),
+        reinterpret_cast<const cuFloatComplex *>(A), lda,
+        reinterpret_cast<cuFloatComplex *>(B), ldb));
+  }
+
   // NOTES: GEMM_EX can use Tensor Core to accelerate matrix multiply.
   // https://docs.nvidia.com/cuda/cublas/index.html#cublassetmathmode
   template <typename... ARGS>
@@ -370,6 +393,20 @@ struct CUBlas<platform::complex<float>> {
         "cublasGemmEx is not supported on cuda <= 7.5"));
 #endif
   }
+
+  static void TRSM_BATCH(cublasHandle_t handle, cublasSideMode_t side,
+                         cublasFillMode_t uplo, cublasOperation_t transa,
+                         cublasDiagType_t diag, int m, int n,
+                         const paddle::platform::complex<float> *alpha,
+                         const paddle::platform::complex<float> **A, int lda,
+                         paddle::platform::complex<float> **B, int ldb,
+                         int batch_size) {
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasCtrsmBatched(
+        handle, side, uplo, transa, diag, m, n,
+        reinterpret_cast<const cuFloatComplex *>(alpha),
+        reinterpret_cast<const cuFloatComplex **>(A), lda,
+        reinterpret_cast<cuFloatComplex **>(B), ldb, batch_size));
+  }
 };
 
 template <>
@@ -440,6 +477,33 @@ struct CUBlas<platform::complex<double>> {
         reinterpret_cast<cuDoubleComplex *>(C), ldc));
   }
 
+  static void TRSM(cublasHandle_t handle, cublasSideMode_t side,
+                   cublasFillMode_t uplo, cublasOperation_t transa,
+                   cublasDiagType_t diag, int m, int n,
+                   const paddle::platform::complex<double> *alpha,
+                   const paddle::platform::complex<double> *A, int lda,
+                   paddle::platform::complex<double> *B, int ldb) {
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasZtrsm(
+        handle, side, uplo, transa, diag, m, n,
+        reinterpret_cast<const cuDoubleComplex *>(alpha),
+        reinterpret_cast<const cuDoubleComplex *>(A), lda,
+        reinterpret_cast<cuDoubleComplex *>(B), ldb));
+  }
+
+  static void TRSM_BATCH(cublasHandle_t handle, cublasSideMode_t side,
+                         cublasFillMode_t uplo, cublasOperation_t transa,
+                         cublasDiagType_t diag, int m, int n,
+                         const paddle::platform::complex<double> *alpha,
+                         const paddle::platform::complex<double> **A, int lda,
+                         paddle::platform::complex<double> **B, int ldb,
+                         int batch_size) {
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasZtrsmBatched(
+        handle, side, uplo, transa, diag, m, n,
+        reinterpret_cast<const cuDoubleComplex *>(alpha),
+        reinterpret_cast<const cuDoubleComplex **>(A), lda,
+        reinterpret_cast<cuDoubleComplex **>(B), ldb, batch_size));
+  }
+
   // NOTES: GEMM_EX can use Tensor Core to accelerate matrix multiply.
   // https://docs.nvidia.com/cuda/cublas/index.html#cublassetmathmode
   template <typename... ARGS>
@@ -897,6 +961,30 @@ void Blas<platform::CUDADeviceContext>::BatchedGETRS(
   });
 }
 
+template <>
+template <typename T>
+void Blas<platform::CUDADeviceContext>::BatchedTRSM(
+    CBLAS_SIDE side, CBLAS_UPLO uplo, CBLAS_TRANSPOSE transA, CBLAS_DIAG diag,
+    int M, int N, T alpha, const T **A, int lda, T **B, int ldb,
+    int batch_size) const {
+  // solve row major `op ( A ) X = α B` by taking it as `X' op ( A' )  =  α B'`
+  // where ' stands for transpose
+  cublasSideMode_t cuSide =
+      (side == CblasLeft) ? CUBLAS_SIDE_RIGHT : CUBLAS_SIDE_LEFT;
+  cublasFillMode_t cuUplo =
+      (uplo == CblasLower) ? CUBLAS_FILL_MODE_UPPER : CUBLAS_FILL_MODE_LOWER;
+  // use CUBLAS_OP_C (conjugate transpose) for complex
+  cublasOperation_t cuTransA =
+      (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
+  cublasDiagType_t cuDiag =
+      (diag == CblasUnit) ? CUBLAS_DIAG_UNIT : CUBLAS_DIAG_NON_UNIT;
+
+  context_.CublasCall([&](cublasHandle_t handle) {
+    CUBlas<T>::TRSM_BATCH(handle, cuSide, cuUplo, cuTransA, cuDiag, N, M,
+                          &alpha, A, lda, B, ldb, batch_size);
+  });
+}
+
 }  // namespace math
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/fluid/operators/math/blas_impl.h b/paddle/fluid/operators/math/blas_impl.h
index cb4044b1b08c7..4bcf3baa64932 100644
--- a/paddle/fluid/operators/math/blas_impl.h
+++ b/paddle/fluid/operators/math/blas_impl.h
@@ -434,6 +434,17 @@ struct CBlas<platform::complex<float>> {
                                    a_, lda, b_, ldb, &beta, c_, ldc);
   }
 
+  static void TRSM(CBLAS_LAYOUT layout, CBLAS_SIDE side, CBLAS_UPLO uplo,
+                   CBLAS_TRANSPOSE trans_a, CBLAS_DIAG diag, int M, int N,
+                   paddle::platform::complex<float> alpha,
+                   const paddle::platform::complex<float> *A, int lda,
+                   paddle::platform::complex<float> *B, int ldb) {
+    const void *a_ = (const void *)(A);
+    void *b_ = static_cast<void *>(B);
+    platform::dynload::cblas_ctrsm(layout, side, uplo, trans_a, diag, M, N,
+                                   &alpha, a_, lda, b_, ldb);
+  }
+
   template <typename... ARGS>
   static void GEMM_BATCH(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE *trans_a,
                          CBLAS_TRANSPOSE *trans_b, int *M, int *N, int *K,
@@ -562,6 +573,17 @@ struct CBlas<platform::complex<double>> {
                                    a_, lda, b_, ldb, &beta, c_, ldc);
   }
 
+  static void TRSM(CBLAS_LAYOUT layout, CBLAS_SIDE side, CBLAS_UPLO uplo,
+                   CBLAS_TRANSPOSE trans_a, CBLAS_DIAG diag, int M, int N,
+                   paddle::platform::complex<double> alpha,
+                   const paddle::platform::complex<double> *A, int lda,
+                   paddle::platform::complex<double> *B, int ldb) {
+    const void *a_ = (const void *)(A);
+    void *b_ = static_cast<void *>(B);
+    platform::dynload::cblas_ztrsm(layout, side, uplo, trans_a, diag, M, N,
+                                   &alpha, a_, lda, b_, ldb);
+  }
+
   template <typename... ARGS>
   static void GEMM_BATCH(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE *trans_a,
                          CBLAS_TRANSPOSE *trans_b, int *M, int *N, int *K,
@@ -682,6 +704,15 @@ struct CBlas<platform::complex<float>> {
     cblas_cgemm(layout, TransA, TransB, M, N, K, &alpha, A, lda, B, ldb, &beta,
                 C, ldc);
   }
+
+  static void TRSM(const CBLAS_LAYOUT layout, const CBLAS_SIDE side,
+                   const CBLAS_UPLO uplo, const CBLAS_TRANSPOSE transA,
+                   const CBLAS_DIAG diag, const int M, const int N,
+                   const paddle::platform::complex<float> alpha,
+                   const paddle::platform::complex<float> *A, const int lda,
+                   paddle::platform::complex<double> *B, const int ldb) {
+    cblas_ctrsm(layout, side, uplo, transA, diag, M, N, &alpha, A, lda, B, ldb);
+  }
 };
 
 template <>
@@ -720,6 +751,15 @@ struct CBlas<platform::complex<double>> {
     cblas_zgemm(layout, TransA, TransB, M, N, K, &alpha, A, lda, B, ldb, &beta,
                 C, ldc);
   }
+
+  static void TRSM(const CBLAS_LAYOUT layout, const CBLAS_SIDE side,
+                   const CBLAS_UPLO uplo, const CBLAS_TRANSPOSE transA,
+                   const CBLAS_DIAG diag, const int M, const int N,
+                   const paddle::platform::complex<double> alpha,
+                   const paddle::platform::complex<double> *A, const int lda,
+                   paddle::platform::complex<double> *B, const int ldb) {
+    cblas_ztrsm(layout, side, uplo, transA, diag, M, N, &alpha, A, lda, B, ldb);
+  }
 };
 
 #endif
diff --git a/paddle/fluid/operators/math/blas_impl.hip.h b/paddle/fluid/operators/math/blas_impl.hip.h
index 1ce5bac5242ab..f972d38adda5f 100644
--- a/paddle/fluid/operators/math/blas_impl.hip.h
+++ b/paddle/fluid/operators/math/blas_impl.hip.h
@@ -90,6 +90,12 @@ struct CUBlas<float> {
     PADDLE_THROW(platform::errors::Unimplemented(
         "cublasSmatinvBatched is not supported on HIP platform."));
   }
+
+  template <typename... ARGS>
+  static void TRSM_BATCH(ARGS... args) {
+    PADDLE_THROW(platform::errors::Unimplemented(
+        "cublasStrsmBatched is not supported on HIP platform."));
+  }
 };
 
 template <>
@@ -153,6 +159,12 @@ struct CUBlas<double> {
     PADDLE_THROW(platform::errors::Unimplemented(
         "cublasDmatinvBatched is not supported on HIP platform."));
   }
+
+  template <typename... ARGS>
+  static void TRSM_BATCH(ARGS... args) {
+    PADDLE_THROW(platform::errors::Unimplemented(
+        "cublasDtrsmBatched is not supported on HIP platform."));
+  }
 };
 
 template <>
@@ -730,6 +742,32 @@ void Blas<platform::CUDADeviceContext>::BatchedGETRS(
                            batch_size);
   });
 }
+
+template <>
+template <typename T>
+void Blas<platform::CUDADeviceContext>::BatchedTRSM(
+    CBLAS_SIDE side, CBLAS_UPLO uplo, CBLAS_TRANSPOSE transA, CBLAS_DIAG diag,
+    int M, int N, T alpha, const T **A, int lda, T **B, int ldb,
+    int batch_size) const {
+  // solve row major `op ( A ) X = α B` by taking it as `X' op ( A' )  =  α B'`
+  // where ' stands for transpose
+  rocblas_side cuSide =
+      (side == CblasLeft) ? rocblas_side_right : rocblas_side_left;
+  rocblas_fill cuUplo =
+      (uplo == CblasLower) ? rocblas_fill_upper : rocblas_fill_lower;
+  // use CUBLAS_OP_C (conjugate transpose) for complex
+  rocblas_operation cuTransA = (transA == CblasNoTrans)
+                                   ? rocblas_operation_none
+                                   : rocblas_operation_transpose;
+  rocblas_diagonal cuDiag =
+      (diag == CblasUnit) ? rocblas_diagonal_unit : rocblas_diagonal_non_unit;
+
+  context_.CublasCall([&](rocblas_handle handle) {
+    CUBlas<T>::TRSM_BATCH(handle, cuSide, cuUplo, cuTransA, cuDiag, N, M,
+                          &alpha, A, lda, B, ldb, batch_size);
+  });
+}
+
 }  // namespace math
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/fluid/operators/math/concat_and_split.cu b/paddle/fluid/operators/math/concat_and_split.cu
index b9481f1c8e40e..614ae93d9fa82 100644
--- a/paddle/fluid/operators/math/concat_and_split.cu
+++ b/paddle/fluid/operators/math/concat_and_split.cu
@@ -18,6 +18,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/mixed_vector.h"
 #include "paddle/fluid/memory/malloc.h"
 #include "paddle/fluid/operators/math/concat_and_split.h"
+#include "paddle/fluid/platform/cuda_graph_with_memory_pool.h"
 #include "paddle/fluid/platform/cuda_primitives.h"
 #include "paddle/fluid/platform/float16.h"
 
@@ -286,10 +287,13 @@ class ConcatFunctor<platform::CUDADeviceContext, T> {
     const T** dev_ins_data = nullptr;
     if (!has_same_shape || in_num < 2 || in_num > 4) {
       tmp_dev_ins_data = memory::Alloc(context, in_num * sizeof(T*));
-      memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, context.GetPlace()),
-                   tmp_dev_ins_data->ptr(), platform::CPUPlace(),
-                   static_cast<void*>(inputs_data), in_num * sizeof(T*),
-                   context.stream());
+      {
+        platform::SkipCUDAGraphCaptureGuard guard;
+        memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, context.GetPlace()),
+                     tmp_dev_ins_data->ptr(), platform::CPUPlace(),
+                     static_cast<void*>(inputs_data), in_num * sizeof(T*),
+                     context.stream());
+      }
       dev_ins_data = reinterpret_cast<const T**>(tmp_dev_ins_data->ptr());
     }
 
@@ -313,10 +317,13 @@ class ConcatFunctor<platform::CUDADeviceContext, T> {
     } else {
       auto tmp_dev_ins_col_data =
           memory::Alloc(context, inputs_col_num * sizeof(int64_t));
-      memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, context.GetPlace()),
-                   tmp_dev_ins_col_data->ptr(), platform::CPUPlace(),
-                   static_cast<void*>(inputs_col),
-                   inputs_col_num * sizeof(int64_t), context.stream());
+      {
+        platform::SkipCUDAGraphCaptureGuard guard;
+        memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, context.GetPlace()),
+                     tmp_dev_ins_col_data->ptr(), platform::CPUPlace(),
+                     static_cast<void*>(inputs_col),
+                     inputs_col_num * sizeof(int64_t), context.stream());
+      }
       int64_t* dev_ins_col_data =
           static_cast<int64_t*>(tmp_dev_ins_col_data->ptr());
 
@@ -415,10 +422,13 @@ class SplitFunctor<platform::CUDADeviceContext, T> {
     T** dev_out_gpu_data = nullptr;
     if (!has_same_shape || o_num < 2 || o_num > 4) {
       tmp_dev_outs_data = memory::Alloc(context, o_num * sizeof(T*));
-      memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, context.GetPlace()),
-                   tmp_dev_outs_data->ptr(), platform::CPUPlace(),
-                   reinterpret_cast<void*>(outputs_data), o_num * sizeof(T*),
-                   context.stream());
+      {
+        platform::SkipCUDAGraphCaptureGuard guard;
+        memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, context.GetPlace()),
+                     tmp_dev_outs_data->ptr(), platform::CPUPlace(),
+                     reinterpret_cast<void*>(outputs_data), o_num * sizeof(T*),
+                     context.stream());
+      }
       dev_out_gpu_data = reinterpret_cast<T**>(tmp_dev_outs_data->ptr());
     }
 
@@ -442,10 +452,13 @@ class SplitFunctor<platform::CUDADeviceContext, T> {
     } else {
       auto tmp_dev_ins_col_data =
           memory::Alloc(context, outputs_cols_num * sizeof(int64_t));
-      memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, context.GetPlace()),
-                   tmp_dev_ins_col_data->ptr(), platform::CPUPlace(),
-                   reinterpret_cast<void*>(outputs_cols),
-                   outputs_cols_num * sizeof(int64_t), context.stream());
+      {
+        platform::SkipCUDAGraphCaptureGuard guard;
+        memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, context.GetPlace()),
+                     tmp_dev_ins_col_data->ptr(), platform::CPUPlace(),
+                     reinterpret_cast<void*>(outputs_cols),
+                     outputs_cols_num * sizeof(int64_t), context.stream());
+      }
       int64_t* dev_outs_col_data =
           reinterpret_cast<int64_t*>(tmp_dev_ins_col_data->ptr());
 
diff --git a/paddle/fluid/operators/math/matrix_solve.cc b/paddle/fluid/operators/math/matrix_solve.cc
index 7f13b5c8a70ee..95c84d83976f5 100644
--- a/paddle/fluid/operators/math/matrix_solve.cc
+++ b/paddle/fluid/operators/math/matrix_solve.cc
@@ -34,6 +34,45 @@ class MatrixSolveFunctor<platform::CPUDeviceContext, T> {
 template class MatrixSolveFunctor<platform::CPUDeviceContext, float>;
 template class MatrixSolveFunctor<platform::CPUDeviceContext, double>;
 
+template <typename T>
+class TriangularSolveFunctor<platform::CPUDeviceContext, T> {
+ public:
+  void operator()(const platform::CPUDeviceContext& context,
+                  const framework::Tensor* a, framework::Tensor* b, bool left,
+                  bool upper, bool transpose, bool unitriangular) {
+    CBLAS_SIDE side = left ? CblasLeft : CblasRight;
+    CBLAS_UPLO uplo = upper ? CblasUpper : CblasLower;
+    CBLAS_TRANSPOSE transA = transpose ? CblasTrans : CblasNoTrans;
+    CBLAS_DIAG diag = unitriangular ? CblasUnit : CblasNonUnit;
+
+    const T* a_data = a->data<T>();
+    T* b_data = b->mutable_data<T>(context.GetPlace());
+
+    int a_dim_size = a->dims().size();
+    int b_dim_size = b->dims().size();
+
+    int M = static_cast<int>(b->dims()[b_dim_size - 2]);
+    int N = static_cast<int>(b->dims()[b_dim_size - 1]);
+    auto lda = left ? std::max(1, M) : std::max(1, N);
+    auto ldb = std::max(1, N);
+
+    int batch_size = 1;
+    auto& a_dim = a->dims();
+    for (int i = 0; i < a_dim_size - 2; i++) {
+      batch_size *= a_dim[i];
+    }
+
+    auto blas = math::GetBlas<platform::CPUDeviceContext, T>(context);
+    for (int i = 0; i < batch_size; i++) {
+      blas.TRSM(side, uplo, transA, diag, M, N, T(1), a_data + i * M * M, lda,
+                b_data + i * N * M, ldb);
+    }
+  }
+};
+
+template class TriangularSolveFunctor<platform::CPUDeviceContext, float>;
+template class TriangularSolveFunctor<platform::CPUDeviceContext, double>;
+
 }  // namespace math
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/fluid/operators/math/matrix_solve.cu.cc b/paddle/fluid/operators/math/matrix_solve.cu.cc
index efb3a07e4c1b4..4e5601248c1a2 100644
--- a/paddle/fluid/operators/math/matrix_solve.cu.cc
+++ b/paddle/fluid/operators/math/matrix_solve.cu.cc
@@ -163,6 +163,68 @@ class MatrixSolveFunctor<platform::CUDADeviceContext, T> {
 template class MatrixSolveFunctor<platform::CUDADeviceContext, float>;
 template class MatrixSolveFunctor<platform::CUDADeviceContext, double>;
 
+template <typename T>
+class TriangularSolveFunctor<platform::CUDADeviceContext, T> {
+ public:
+  void operator()(const platform::CUDADeviceContext& context, const Tensor* a,
+                  Tensor* b, bool left, bool upper, bool transpose,
+                  bool unitriangular) {
+    CBLAS_SIDE side = left ? CblasLeft : CblasRight;
+    CBLAS_UPLO uplo = upper ? CblasUpper : CblasLower;
+    CBLAS_TRANSPOSE transA = transpose ? CblasTrans : CblasNoTrans;
+    CBLAS_DIAG diag = unitriangular ? CblasUnit : CblasNonUnit;
+
+    const T* a_data = a->data<T>();
+    T* b_data = b->mutable_data<T>(context.GetPlace());
+
+    int a_dim_size = a->dims().size();
+    int b_dim_size = b->dims().size();
+
+    int M = static_cast<int>(b->dims()[b_dim_size - 2]);
+    int N = static_cast<int>(b->dims()[b_dim_size - 1]);
+    auto lda = left ? std::max(1, M) : std::max(1, N);
+    auto ldb = std::max(1, N);
+
+    int batch_size = 1;
+    auto& a_dim = a->dims();
+    for (int i = 0; i < a_dim_size - 2; i++) {
+      batch_size *= a_dim[i];
+    }
+
+    auto blas = math::GetBlas<platform::CUDADeviceContext, T>(context);
+    if (batch_size <= 8 && M >= 64) {
+      for (auto i = 0; i < batch_size; i++) {
+        blas.TRSM(side, uplo, transA, diag, M, N, static_cast<T>(1.0),
+                  a_data + i * M * M, lda, b_data + i * N * M, ldb);
+      }
+    } else {
+      std::vector<const T*> cpu_ptrs(batch_size * 2);
+      for (int i = 0; i < batch_size; ++i) {
+        cpu_ptrs[i] = a_data + i * M * M;
+        cpu_ptrs[i + batch_size] = b_data + i * M * N;
+      }
+
+      // Copy the addresses of A and tmp_b from host to device.
+      memory::allocation::AllocationPtr tmp_gpu_ptrs_data =
+          memory::Alloc(context, cpu_ptrs.size() * sizeof(T*));
+      memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, context.GetPlace()),
+                   tmp_gpu_ptrs_data->ptr(), platform::CPUPlace(),
+                   static_cast<void*>(cpu_ptrs.data()),
+                   cpu_ptrs.size() * sizeof(T*), context.stream());
+
+      const T** gpu_a_ptrs =
+          reinterpret_cast<const T**>(tmp_gpu_ptrs_data->ptr());
+      T** gpu_b_ptrs =
+          reinterpret_cast<T**>(tmp_gpu_ptrs_data->ptr()) + batch_size;
+      blas.BatchedTRSM(side, uplo, transA, diag, M, N, static_cast<T>(1.0),
+                       gpu_a_ptrs, lda, gpu_b_ptrs, ldb, batch_size);
+    }
+  }
+};
+
+template class TriangularSolveFunctor<platform::CUDADeviceContext, float>;
+template class TriangularSolveFunctor<platform::CUDADeviceContext, double>;
+
 }  // namespace math
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/fluid/operators/math/matrix_solve.h b/paddle/fluid/operators/math/matrix_solve.h
index 415d0c6dd8e0c..1dc43205592f6 100644
--- a/paddle/fluid/operators/math/matrix_solve.h
+++ b/paddle/fluid/operators/math/matrix_solve.h
@@ -117,6 +117,14 @@ class MatrixSolveFunctor {
                   const framework::Tensor& b, framework::Tensor* out);
 };
 
+template <typename DeviceContext, typename T>
+class TriangularSolveFunctor {
+ public:
+  void operator()(const DeviceContext& context, const framework::Tensor* a,
+                  framework::Tensor* b, bool left, bool upper, bool transpose,
+                  bool unitriangular);
+};
+
 }  // namespace math
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/fluid/operators/math/sequence_padding.cc b/paddle/fluid/operators/math/sequence_padding.cc
index dca58f796a76f..e29313e9f742c 100644
--- a/paddle/fluid/operators/math/sequence_padding.cc
+++ b/paddle/fluid/operators/math/sequence_padding.cc
@@ -33,8 +33,7 @@ void CopyValidData(framework::Tensor* dst_tensor,
                    const framework::Tensor* src_tensor,
                    const framework::Vector<size_t>& seq_offsets,
                    int pad_seq_len, int step_width, bool norm_by_len,
-                   bool norm_by_batchsize, bool norm_by_total_logits_len,
-                   int total_logits_len, CopyType type, PadLayout layout) {
+                   CopyType type, PadLayout layout) {
   int seq_num = seq_offsets.size() - 1;
   const T* src_data = src_tensor->data<T>();
   T* dst_data = dst_tensor->data<T>();
@@ -55,21 +54,7 @@ void CopyValidData(framework::Tensor* dst_tensor,
     int pad_data_offset = layout == kBatchLengthWidth
                               ? seq_idx * pad_seq_len * step_width
                               : seq_idx * step_width;
-
-    float scale = 1.0f;
-    if (norm_by_total_logits_len) {
-      scale = 1.0f / static_cast<float>(total_logits_len);
-      VLOG(3) << "[warpctc grad][norm_by_total_logits_len]: scale " << scale
-              << "total_logits_len " << total_logits_len;
-    } else if (norm_by_batchsize) {
-      scale = 1.0f / static_cast<float>(seq_num);
-      VLOG(3) << "[warpctc grad][norm_by_batchsize]: scale " << scale << "B "
-              << seq_num;
-    } else if (norm_by_len) {
-      scale = 1.0f / static_cast<float>(valid_seq_len);
-      VLOG(3) << "[warpctc grad][norm_by_len]: scale " << scale << "T "
-              << valid_seq_len;
-    }
+    float scale = 1.0f / static_cast<float>(valid_seq_len);
 
     for (int step_idx = 0; step_idx < valid_seq_len; ++step_idx) {
       const T* src =
@@ -112,8 +97,6 @@ class PaddingLoDTensorFunctor<platform::CPUDeviceContext, T> {
                   framework::LoDTensor* pad_tensor,
                   const framework::LoDTensor& pad_value, int pad_seq_len = -1,
                   int lod_level = 0, bool norm_by_times = false,
-                  bool norm_by_batchsize = false,
-                  bool norm_by_total_logits_len = false,
                   const PadLayout layout = kBatchLengthWidth) {
     auto seq_lod = seq_tensor.lod();
     const auto seq_offsets = framework::ToAbsOffset(seq_lod)[lod_level];
@@ -148,8 +131,7 @@ class PaddingLoDTensorFunctor<platform::CPUDeviceContext, T> {
     }
 
     CopyValidData<T>(pad_tensor, &seq_tensor, seq_offsets, pad_seq_len,
-                     step_width, norm_by_times, false, false, 0, kSeqToPad,
-                     layout);
+                     step_width, norm_by_times, kSeqToPad, layout);
   }
 };
 
@@ -160,8 +142,6 @@ class UnpaddingLoDTensorFunctor<platform::CPUDeviceContext, T> {
                   const framework::LoDTensor& pad_tensor,
                   framework::LoDTensor* seq_tensor, int pad_seq_len = -1,
                   int lod_level = 0, bool norm_by_times = false,
-                  bool norm_by_batchsize = false,
-                  bool norm_by_total_logits_len = false,
                   const PadLayout layout = kBatchLengthWidth) {
     auto seq_offsets = framework::ToAbsOffset(seq_tensor->lod())[lod_level];
     const auto& seq_tensor_dims = seq_tensor->dims();
@@ -169,16 +149,13 @@ class UnpaddingLoDTensorFunctor<platform::CPUDeviceContext, T> {
     if (pad_seq_len == -1) {
       pad_seq_len = MaximumSequenceLength(seq_offsets);
     }
-    int total_logits_len = TotalSequenceLength(seq_offsets);
     int step_width = seq_tensor->numel() / seq_tensor_dims[0];
 
     CheckDims(seq_tensor_dims, pad_tensor_dims, seq_offsets, pad_seq_len,
               step_width, layout);
 
     CopyValidData<T>(seq_tensor, &pad_tensor, seq_offsets, pad_seq_len,
-                     step_width, norm_by_times, norm_by_batchsize,
-                     norm_by_total_logits_len, total_logits_len, kPadToSeq,
-                     layout);
+                     step_width, norm_by_times, kPadToSeq, layout);
   }
 };
 
diff --git a/paddle/fluid/operators/math/sequence_padding.cu b/paddle/fluid/operators/math/sequence_padding.cu
index 3578d7e91fd8c..19c3af03411b8 100644
--- a/paddle/fluid/operators/math/sequence_padding.cu
+++ b/paddle/fluid/operators/math/sequence_padding.cu
@@ -23,9 +23,7 @@ template <typename T, CopyType Type>
 __global__ void SequencePaddingKernel(
     T* dst, const T* src, const T* pad_value, bool is_constant_pad,
     const size_t* seq_offsets, const size_t seq_num, const size_t pad_seq_len,
-    const size_t step_width, bool norm_by_len, bool norm_by_batchsize,
-    bool norm_by_total_logits_len, int total_logits_len,
-    const PadLayout layout) {
+    const size_t step_width, bool norm_by_len, const PadLayout layout) {
   size_t seq_idx = blockIdx.y;
   size_t seq_len = seq_offsets[seq_idx + 1] - seq_offsets[seq_idx];
 
@@ -40,15 +38,7 @@ __global__ void SequencePaddingKernel(
       src + (Type == kSeqToPad ? seq_data_offset : pad_data_offset);
 
   if (step_idx < seq_len) {
-    float scale = 1.0f;
-    if (norm_by_total_logits_len) {
-      scale = 1.0f / static_cast<float>(total_logits_len);
-    } else if (norm_by_batchsize) {
-      scale = 1.0f / static_cast<float>(seq_num);
-    } else if (norm_by_len) {
-      scale = norm_by_len ? (1.0f / static_cast<float>(seq_len)) : 1.0f;
-    }
-
+    float scale = norm_by_len ? (1.0f / static_cast<float>(seq_len)) : 1.0f;
     for (size_t i = threadIdx.x; i < step_width; i += blockDim.x) {
       dst_data[i] = scale * src_data[i];
     }
@@ -67,8 +57,6 @@ class PaddingLoDTensorFunctor<platform::CUDADeviceContext, T> {
                   framework::LoDTensor* pad_tensor,
                   const framework::LoDTensor& pad_value, int pad_seq_len = -1,
                   int lod_level = 0, bool norm_by_times = false,
-                  bool norm_by_batchsize = false,
-                  bool norm_by_total_logits_len = false,
                   const PadLayout layout = kBatchLengthWidth) {
     auto seq_lod = seq_tensor.lod();
     const auto seq_offsets = framework::ToAbsOffset(seq_lod)[lod_level];
@@ -119,7 +107,7 @@ class PaddingLoDTensorFunctor<platform::CUDADeviceContext, T> {
     SequencePaddingKernel<T, kSeqToPad><<<grid, threads, 0, context.stream()>>>(
         pad_data, seq_data, pad_value_data, pad_value.numel() == 1,
         seq_offsets.CUDAData(context.GetPlace()), seq_num, pad_seq_len,
-        step_width, norm_by_times, false, false, 0, layout);
+        step_width, norm_by_times, layout);
   }
 };
 
@@ -130,8 +118,6 @@ class UnpaddingLoDTensorFunctor<platform::CUDADeviceContext, T> {
                   const framework::LoDTensor& pad_tensor,
                   framework::LoDTensor* seq_tensor, int pad_seq_len = -1,
                   int lod_level = 0, bool norm_by_times = false,
-                  bool norm_by_batchsize = false,
-                  bool norm_by_total_logits_len = false,
                   const PadLayout layout = kBatchLengthWidth) {
     auto seq_offsets = framework::ToAbsOffset(seq_tensor->lod())[lod_level];
     const auto& seq_tensor_dims = seq_tensor->dims();
@@ -140,7 +126,6 @@ class UnpaddingLoDTensorFunctor<platform::CUDADeviceContext, T> {
     if (pad_seq_len == -1) {
       pad_seq_len = max_seq_len;
     }
-    int total_logits_len = TotalSequenceLength(seq_offsets);
     int step_width = seq_tensor->numel() / seq_tensor_dims[0];
     int seq_num = seq_offsets.size() - 1;
 
@@ -174,8 +159,7 @@ class UnpaddingLoDTensorFunctor<platform::CUDADeviceContext, T> {
     SequencePaddingKernel<T, kPadToSeq><<<grid, threads, 0, context.stream()>>>(
         seq_data, pad_data, nullptr, false,
         seq_offsets.CUDAData(context.GetPlace()), seq_num, pad_seq_len,
-        step_width, norm_by_times, norm_by_batchsize, norm_by_total_logits_len,
-        total_logits_len, layout);
+        step_width, norm_by_times, layout);
   }
 };
 
diff --git a/paddle/fluid/operators/math/sequence_padding.h b/paddle/fluid/operators/math/sequence_padding.h
index 308e1eedebd37..956a4ff6a2d45 100644
--- a/paddle/fluid/operators/math/sequence_padding.h
+++ b/paddle/fluid/operators/math/sequence_padding.h
@@ -107,8 +107,6 @@ class PaddingLoDTensorFunctor {
                   framework::LoDTensor* pad_tensor,
                   const framework::LoDTensor& pad_value, int pad_seq_len = -1,
                   int lod_level = 0, bool norm_by_times = false,
-                  bool norm_by_batchsize = false,
-                  bool norm_by_total_logits_len = false,
                   const PadLayout layout = kBatchLengthWidth);
 };
 
@@ -119,8 +117,6 @@ class UnpaddingLoDTensorFunctor {
                   const framework::LoDTensor& pad_tensor,
                   framework::LoDTensor* seq_tensor, int pad_seq_len = -1,
                   int lod_level = 0, bool norm_by_times = false,
-                  bool norm_by_batchsize = false,
-                  bool norm_by_total_logits_len = false,
                   const PadLayout layout = kBatchLengthWidth);
 };
 
diff --git a/paddle/fluid/operators/math/sequence_padding_test.cc b/paddle/fluid/operators/math/sequence_padding_test.cc
index 590d1d6191de4..ea31b10c5558f 100644
--- a/paddle/fluid/operators/math/sequence_padding_test.cc
+++ b/paddle/fluid/operators/math/sequence_padding_test.cc
@@ -66,13 +66,13 @@ void TestSequencePadding(const DeviceContext &context,
   }
 
   paddle::operators::math::PaddingLoDTensorFunctor<DeviceContext, T>()(
-      context, seq, &padding, pad_value, -1, 0, false, false, false,
+      context, seq, &padding, pad_value, -1, 0, false,
       paddle::operators::math::kLengthBatchWidth);
 
   seq_back.set_lod(lod);
   seq_back.mutable_data<T>(seq_dims, place);
   paddle::operators::math::UnpaddingLoDTensorFunctor<DeviceContext, T>()(
-      context, padding, &seq_back, -1, 0, false, false, false,
+      context, padding, &seq_back, -1, 0, false,
       paddle::operators::math::kLengthBatchWidth);
 
   if (paddle::platform::is_cpu_place(place)) {
diff --git a/paddle/fluid/operators/matmul_op.cc b/paddle/fluid/operators/matmul_op.cc
index 4e435660ff6dc..051f97ad4ec8d 100644
--- a/paddle/fluid/operators/matmul_op.cc
+++ b/paddle/fluid/operators/matmul_op.cc
@@ -336,6 +336,8 @@ framework::DDim GetDimForInput(const framework::InferShapeContext &ctx,
                         "The Input(%s) has not been initialized properly. The "
                         "shape of Input(%s) = [%s].",
                         dim));
+
+  // if mkldnn reshape+transpose+matmul fuse activated
   if (!shape.empty() && !axis.empty()) {
     PADDLE_ENFORCE_GE(
         shape.size(), 2,
@@ -355,6 +357,43 @@ framework::DDim GetDimForInput(const framework::InferShapeContext &ctx,
             "Ranks of shape_%s and axis_%s attributes of MatMulOp "
             "must be equal.",
             input_name, input_name));
+
+    int num_negative = std::count(shape.begin(), shape.end(), -1);
+    PADDLE_ENFORCE_LE(num_negative, 1,
+                      platform::errors::InvalidArgument(
+                          "The max number of -1 in fused_reshape_%s is 1 "
+                          "but received %d.",
+                          input_name, num_negative));
+
+    auto it_zero = std::find(shape.begin(), shape.end(), 0);
+    if (it_zero != shape.end()) {
+      for (uint64_t i = 0; i < shape.size(); i++) {
+        if (shape[i] == 0) {
+          PADDLE_ENFORCE_LT(i, dim.size(),
+                            platform::errors::InvalidArgument(
+                                "The index of 0 in fused_reshape_%s ",
+                                "should be less than output dim size, ",
+                                "but the index is %d and output dim size is %d",
+                                input_name, i, dim.size()));
+          shape[i] = dim.at(i);
+        }
+      }
+    }
+
+    // if "-1" is present then one of reshape dims must be infered
+    auto it_negative = std::find(shape.begin(), shape.end(), -1);
+    if (it_negative != shape.end()) {
+      int64_t dim_product = 1;
+      for (int i = 0; i < dim.size(); i++) {
+        dim_product *= dim.at(i);
+      }
+
+      int64_t shape_product = std::accumulate(shape.begin(), shape.end(), -1,
+                                              std::multiplies<int>());
+      int index = std::distance(shape.begin(), it_negative);
+      shape[index] = dim_product / shape_product;
+    }
+
     dim = dim.reshape(shape).transpose(axis);
   }
   return dim;
diff --git a/paddle/fluid/operators/matmul_op_xpu.cc b/paddle/fluid/operators/matmul_op_xpu.cc
index 7097b5327d86f..53593d2db01f7 100644
--- a/paddle/fluid/operators/matmul_op_xpu.cc
+++ b/paddle/fluid/operators/matmul_op_xpu.cc
@@ -20,6 +20,7 @@ limitations under the License. */
 
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/math/blas.h"
+#include "paddle/fluid/operators/xpu_api_wrapper.h"
 
 namespace paddle {
 namespace operators {
@@ -151,28 +152,26 @@ static void MatMulXPUFunction(const Tensor *x, const Tensor *y, Tensor *out,
                         x_dims.to_str().c_str(), y_dims.to_str().c_str()));
 
   float alpha = static_cast<T>(ctx.Attr<float>("alpha"));
-
   T *data_c = out->data<T>();
   int m = mat_dim_a.height_;
   int n = mat_dim_b.width_;
   int k = mat_dim_a.width_;
   int batch_size = mat_dim_a.batch_size_;
-
   int ldx = mat_dim_a.trans_ ? m : k;
   int ldy = mat_dim_b.trans_ ? k : n;
   int ldout = n;
   if (batch_size <= 1) {
     int r = 0;
-    r = xpu::fc_fusion<XPUType, XPUType, XPUType, FCT>(
+    r = xpu_fc_wrapper<XPUType, FCT>(
         dev_ctx.x_context(), reinterpret_cast<const XPUType *>(x->data<T>()),
         reinterpret_cast<const XPUType *>(y->data<T>()),
         reinterpret_cast<XPUType *>(data_c), m, n, k, mat_dim_a.trans_,
         mat_dim_b.trans_, nullptr, nullptr, nullptr, ldx, ldy, ldout, alpha, 0,
         nullptr, xpu::Activation_t::LINEAR);
-    PADDLE_ENFORCE_EQ(r, XPU_SUCCESS,
-                      platform::errors::External(
-                          "XPU fc_fusion kernel return wrong value[%d %s]", r,
-                          XPUAPIErrorMsg[r]));
+    PADDLE_ENFORCE_EQ(
+        r, XPU_SUCCESS,
+        platform::errors::External("XPU fc kernel return wrong value[%d %s]", r,
+                                   XPUAPIErrorMsg[r]));
   } else {
     // batch matmul
     int r = xpu::fc_batched<XPUType, XPUType, XPUType, FCT>(
@@ -216,8 +215,10 @@ class MatMulXPUKernel : public framework::OpKernel<T> {
     if (std::is_same<paddle::platform::float16, T>::value) {
       MatMulXPUFunction<T, int16_t>(x, y, out, trans_x, trans_y, context);
     } else {
-      if (std::getenv("XPU_PADDLE_MAT_MUL_FCINT32") != nullptr) {
+      if (std::getenv("XPU_PADDLE_FC_INT32") != nullptr) {
         MatMulXPUFunction<T, int32_t>(x, y, out, trans_x, trans_y, context);
+      } else if (std::getenv("XPU_PADDLE_FC_LOCAL_INT16") != nullptr) {
+        MatMulXPUFunction<T, float>(x, y, out, trans_x, trans_y, context);
       } else {
         MatMulXPUFunction<T, int16_t>(x, y, out, trans_x, trans_y, context);
       }
@@ -292,8 +293,10 @@ class MatMulGradXPUKernel : public framework::OpKernel<T> {
     if (std::is_same<paddle::platform::float16, T>::value) {
       MatMulXPUFunction<T, int16_t>(&a, &b, out, trans_a, trans_b, context);
     } else {
-      if (std::getenv("XPU_PADDLE_MAT_MUL_GRAD_FCINT32") != nullptr) {
+      if (std::getenv("XPU_PADDLE_FC_INT32") != nullptr) {
         MatMulXPUFunction<T, int32_t>(&a, &b, out, trans_a, trans_b, context);
+      } else if (std::getenv("XPU_PADDLE_FC_LOCAL_INT16") != nullptr) {
+        MatMulXPUFunction<T, float>(&a, &b, out, trans_a, trans_b, context);
       } else {
         MatMulXPUFunction<T, int16_t>(&a, &b, out, trans_a, trans_b, context);
       }
diff --git a/paddle/fluid/operators/matmul_v2_op.cc b/paddle/fluid/operators/matmul_v2_op.cc
index 1b609b15d6e56..bd32af1c8f623 100644
--- a/paddle/fluid/operators/matmul_v2_op.cc
+++ b/paddle/fluid/operators/matmul_v2_op.cc
@@ -347,6 +347,76 @@ class MatMulV2OpDoubleGradMaker : public framework::SingleGradOpMaker<T> {
     op->SetAttrMap(this->Attrs());
   }
 };
+class MatMulV2OpTripleGrad : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  void InferShape(framework::InferShapeContext* context) const override {
+    OP_INOUT_CHECK(context->HasInput("X"), "Input", "X",
+                   "matmul_v2_triple_grad");
+    OP_INOUT_CHECK(context->HasInput("Y"), "Input", "Y",
+                   "matmul_v2_triple_grad");
+    OP_INOUT_CHECK(context->HasInput("DOut"), "Input", "DOut",
+                   "matmul_v2_triple_grad");
+    OP_INOUT_CHECK(context->HasInput("DDX"), "Input", "DDX",
+                   "matmul_v2_triple_grad");
+    OP_INOUT_CHECK(context->HasInput("DDY"), "Input", "DDY",
+                   "matmul_v2_triple_grad");
+    OP_INOUT_CHECK(context->HasInput("D_DX"), "Input", "D_DX",
+                   "matmul_v2_triple_grad");
+    OP_INOUT_CHECK(context->HasInput("D_DY"), "Input", "D_DY",
+                   "matmul_v2_triple_grad");
+    OP_INOUT_CHECK(context->HasInput("D_DDOut"), "Input", "D_DDOut",
+                   "matmul_v2_triple_grad");
+
+    if (context->HasOutput("D_X_out")) {
+      context->ShareDim("X", "D_X_out");
+    }
+    if (context->HasOutput("D_Y_out")) {
+      context->ShareDim("Y", "D_Y_out");
+    }
+    if (context->HasOutput("D_DOut_out")) {
+      context->ShareDim("DOut", "D_DOut_out");
+    }
+    if (context->HasOutput("D_DDX_out")) {
+      context->ShareDim("X", "D_DDX_out");
+    }
+    if (context->HasOutput("D_DDY_out")) {
+      context->ShareDim("Y", "D_DDY_out");
+    }
+  }
+};
+
+template <typename T>
+class MatMulV2OpTripleGradMaker : public framework::SingleGradOpMaker<T> {
+ public:
+  using framework::SingleGradOpMaker<T>::SingleGradOpMaker;
+
+ protected:
+  void Apply(GradOpPtr<T> op) const override {
+    op->SetType("matmul_v2_triple_grad");
+
+    // get input from double grad
+    op->SetInput("X", this->Input("X"));
+    op->SetInput("Y", this->Input("Y"));
+    op->SetInput("DOut", this->Input("DOut"));
+    op->SetInput("DDX", this->Input("DDX"));
+    op->SetInput("DDY", this->Input("DDY"));
+    op->SetInput("D_DX", this->OutputGrad("DX"));
+    op->SetInput("D_DY", this->OutputGrad("DY"));
+    op->SetInput("D_DDOut", this->OutputGrad("DDOut"));
+
+    // set outputs
+    op->SetOutput("D_X_out", this->InputGrad("X"));
+    op->SetOutput("D_Y_out", this->InputGrad("Y"));
+    op->SetOutput("D_DOut_out", this->InputGrad("DOut"));
+    op->SetOutput("D_DDX_out", this->InputGrad("DDX"));
+    op->SetOutput("D_DDY_out", this->InputGrad("DDY"));
+
+    op->SetAttrMap(this->Attrs());
+  }
+};
 }  // namespace operators
 }  // namespace paddle
 
@@ -359,7 +429,11 @@ REGISTER_OPERATOR(matmul_v2_grad, ops::MatMulV2OpGrad,
                   ops::MatMulV2OpDoubleGradMaker<paddle::framework::OpDesc>,
                   ops::MatMulV2OpDoubleGradMaker<paddle::imperative::OpBase>);
 
-REGISTER_OPERATOR(matmul_v2_grad_grad, ops::MatMulV2OpDoubleGrad);
+REGISTER_OPERATOR(matmul_v2_grad_grad, ops::MatMulV2OpDoubleGrad,
+                  ops::MatMulV2OpTripleGradMaker<paddle::framework::OpDesc>,
+                  ops::MatMulV2OpTripleGradMaker<paddle::imperative::OpBase>);
+
+REGISTER_OPERATOR(matmul_v2_triple_grad, ops::MatMulV2OpTripleGrad);
 
 REGISTER_OP_CPU_KERNEL(
     matmul_v2, ops::MatMulV2Kernel<paddle::platform::CPUDeviceContext, float>,
@@ -385,3 +459,12 @@ REGISTER_OP_CPU_KERNEL(
                                   paddle::platform::complex<float>>,
     ops::MatMulV2DoubleGradKernel<paddle::platform::CPUDeviceContext,
                                   paddle::platform::complex<double>>);
+
+REGISTER_OP_CPU_KERNEL(
+    matmul_v2_triple_grad,
+    ops::MatMulV2TripleGradKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::MatMulV2TripleGradKernel<paddle::platform::CPUDeviceContext, double>,
+    ops::MatMulV2TripleGradKernel<paddle::platform::CPUDeviceContext,
+                                  paddle::platform::complex<float>>,
+    ops::MatMulV2TripleGradKernel<paddle::platform::CPUDeviceContext,
+                                  paddle::platform::complex<double>>);
diff --git a/paddle/fluid/operators/matmul_v2_op.cu b/paddle/fluid/operators/matmul_v2_op.cu
index b258077456e1e..c9602a1eab931 100644
--- a/paddle/fluid/operators/matmul_v2_op.cu
+++ b/paddle/fluid/operators/matmul_v2_op.cu
@@ -40,3 +40,13 @@ REGISTER_OP_CUDA_KERNEL(
                                   paddle::platform::complex<float>>,
     ops::MatMulV2DoubleGradKernel<paddle::platform::CUDADeviceContext,
                                   paddle::platform::complex<double>>);
+
+REGISTER_OP_CUDA_KERNEL(
+    matmul_v2_triple_grad,
+    ops::MatMulV2TripleGradKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::MatMulV2TripleGradKernel<paddle::platform::CUDADeviceContext, double>,
+    ops::MatMulV2TripleGradKernel<plf::CUDADeviceContext, plf::float16>,
+    ops::MatMulV2TripleGradKernel<paddle::platform::CUDADeviceContext,
+                                  paddle::platform::complex<float>>,
+    ops::MatMulV2TripleGradKernel<paddle::platform::CUDADeviceContext,
+                                  paddle::platform::complex<double>>);
diff --git a/paddle/fluid/operators/matmul_v2_op.h b/paddle/fluid/operators/matmul_v2_op.h
index dd9940db29f77..ee95881caa9c5 100644
--- a/paddle/fluid/operators/matmul_v2_op.h
+++ b/paddle/fluid/operators/matmul_v2_op.h
@@ -25,6 +25,11 @@ limitations under the License. */
 #include "paddle/fluid/operators/math/complex_functors.h"
 #include "paddle/fluid/operators/reduce_ops/reduce_sum_op.h"
 
+// only can include the headers in paddle/pten/api dirs
+#include "paddle/pten/api/lib/utils/tensor_utils.h"
+#include "paddle/pten/include/core.h"
+#include "paddle/pten/include/linalg.h"
+
 #if defined(__NVCC__) || defined(__HIPCC__)
 #include "paddle/fluid/operators/reduce_ops/cub_reduce.h"
 #endif
@@ -380,15 +385,17 @@ class MatMulV2Kernel : public framework::OpKernel<T> {
     auto* Out = ctx.Output<Tensor>("Out");
     bool trans_x = ctx.Attr<bool>("trans_x");
     bool trans_y = ctx.Attr<bool>("trans_y");
-    PADDLE_ENFORCE_NE(framework::product(X->dims()), 0,
-                      platform::errors::InvalidArgument(
-                          "The Input(X) dims size must not be equal 0,"
-                          " but reviced dims size is 0. "));
-    PADDLE_ENFORCE_NE(framework::product(Y->dims()), 0,
-                      platform::errors::InvalidArgument(
-                          "The Input(Y) dims size must not be equal 0,"
-                          " but reviced dims size is 0. "));
-    MatMulFunction<DeviceContext, T>(X, Y, Out, trans_x, trans_y, ctx);
+
+    auto& dev_ctx = ctx.device_context<DeviceContext>();
+    Out->mutable_data<T>(X->place());
+
+    auto pt_x = paddle::experimental::MakePtenDenseTensor(*X);
+    auto pt_y = paddle::experimental::MakePtenDenseTensor(*Y);
+    auto pt_out = paddle::experimental::MakePtenDenseTensor(*Out);
+
+    // call new kernel
+    pten::Matmul<T>(dev_ctx, *pt_x.get(), *pt_y.get(), trans_x, trans_y,
+                    pt_out.get());
   }
 };
 
@@ -779,6 +786,421 @@ struct DotDoubleGradFunction<DeviceContext, T, math::DisableComplex<T>> {
   }
 };
 
+template <typename DeviceContext, typename T, typename Enabel = void>
+struct DotTripleGradFunction {
+  void operator()(const Tensor* in_tensor_x, const Tensor* in_tensor_y,
+                  const Tensor* in_tensor_ddx, const Tensor* in_tensor_ddy,
+                  const Tensor* in_tensor_d_dx, const Tensor* in_tensor_d_dy,
+                  const Tensor* in_tensor_dout, const Tensor* in_tensor_d_ddout,
+                  Tensor* out_tensor_d_x, Tensor* out_tensor_d_y,
+                  Tensor* out_tensor_d_dout, Tensor* out_tensor_d_ddx,
+                  Tensor* out_tensor_d_ddy,
+                  const paddle::framework::ExecutionContext& ctx);
+};
+
+// TODO(wuweilong): enable this function when the unittests framewark for multi
+// grad is ok (dtype: complex64 or complex128).
+template <typename DeviceContext, typename T>
+struct DotTripleGradFunction<DeviceContext, T, math::EnableComplex<T>> {
+  void operator()(const Tensor* in_tensor_x, const Tensor* in_tensor_y,
+                  const Tensor* in_tensor_ddx, const Tensor* in_tensor_ddy,
+                  const Tensor* in_tensor_d_dx, const Tensor* in_tensor_d_dy,
+                  const Tensor* in_tensor_dout, const Tensor* in_tensor_d_ddout,
+                  Tensor* out_tensor_d_x, Tensor* out_tensor_d_y,
+                  Tensor* out_tensor_d_dout, Tensor* out_tensor_d_ddx,
+                  Tensor* out_tensor_d_ddy,
+                  const paddle::framework::ExecutionContext& ctx) {
+#if defined(__NVCC__) || defined(__HIPCC__)
+    if (1 == in_tensor_d_ddout->dims().size()) {
+      framework::Tensor in_tensor_d_ddout_help;
+      auto& dev_raw = ctx.template device_context<DeviceContext>();
+      auto& dev = *dev_raw.eigen_device();
+      if (out_tensor_d_x || out_tensor_d_y) {
+        in_tensor_d_ddout_help.Resize(in_tensor_d_ddout->dims());
+        in_tensor_d_ddout_help.mutable_data<T>(ctx.GetPlace());
+        paddle::platform::ForRange<DeviceContext> for_range(
+            dev_raw, in_tensor_d_ddout->numel());
+        math::ConjFunctor<T> functor(in_tensor_d_ddout->data<T>(),
+                                     in_tensor_d_ddout->numel(),
+                                     in_tensor_d_ddout_help.data<T>());
+        for_range(functor);
+      }
+      if (out_tensor_d_x) {
+        auto ddy = framework::EigenVector<T>::Flatten(*in_tensor_ddy);
+        Eigen::DSizes<int, 1> size(in_tensor_ddy->numel());
+        auto d_x = framework::EigenVector<T>::Flatten(*out_tensor_d_x);
+        auto d_ddout =
+            framework::EigenVector<T>::Flatten(in_tensor_d_ddout_help);
+        d_x.device(dev) = ddy * d_ddout.broadcast(size);
+      }
+
+      if (out_tensor_d_y) {
+        auto ddx = framework::EigenVector<T>::Flatten(*in_tensor_ddx);
+        Eigen::DSizes<int, 1> size(in_tensor_ddx->numel());
+        auto d_y = framework::EigenVector<T>::Flatten(*out_tensor_d_y);
+        auto d_ddout =
+            framework::EigenVector<T>::Flatten(in_tensor_d_ddout_help);
+        d_y.device(dev) = ddx * d_ddout.broadcast(size);
+      }
+
+      if (out_tensor_d_dout) {
+        framework::Tensor in_tensor_ddx_help, in_tensor_ddy_help;
+        in_tensor_ddx_help.Resize(in_tensor_ddx->dims());
+        in_tensor_ddx_help.mutable_data<T>(ctx.GetPlace());
+        in_tensor_ddy_help.Resize(in_tensor_ddy->dims());
+        in_tensor_ddy_help.mutable_data<T>(ctx.GetPlace());
+
+        auto& dev_raw = ctx.template device_context<DeviceContext>();
+        auto& dev = *dev_raw.eigen_device();
+        paddle::platform::ForRange<DeviceContext> for_range(
+            dev_raw, in_tensor_ddx->numel());
+        math::ConjFunctor<T> functor_ddx(in_tensor_ddx->data<T>(),
+                                         in_tensor_ddx->numel(),
+                                         in_tensor_ddx_help.data<T>());
+        for_range(functor_ddx);
+        math::ConjFunctor<T> functor_ddy(in_tensor_ddy->data<T>(),
+                                         in_tensor_ddy->numel(),
+                                         in_tensor_ddy_help.data<T>());
+        for_range(functor_ddy);
+        auto ddx = framework::EigenVector<T>::Flatten(in_tensor_ddx_help);
+        auto ddy = framework::EigenVector<T>::Flatten(in_tensor_ddy_help);
+        auto d_dx = framework::EigenVector<T>::Flatten(*in_tensor_d_dx);
+        auto d_dy = framework::EigenVector<T>::Flatten(*in_tensor_d_dy);
+        auto d_dout = framework::EigenVector<T>::Flatten(*out_tensor_d_dout);
+        d_dout.device(dev) = (ddx * d_dy + ddy * d_dx).sum();
+      }
+      if (out_tensor_d_ddx) {
+        framework::Tensor in_tensor_dout_help, in_tensor_y_help;
+        in_tensor_dout_help.Resize(in_tensor_dout->dims());
+        in_tensor_dout_help.mutable_data<T>(ctx.GetPlace());
+        in_tensor_y_help.Resize(in_tensor_y->dims());
+        in_tensor_y_help.mutable_data<T>(ctx.GetPlace());
+
+        auto& dev_raw = ctx.template device_context<DeviceContext>();
+        auto& dev = *dev_raw.eigen_device();
+        paddle::platform::ForRange<DeviceContext> for_range(
+            dev_raw, in_tensor_dout->numel());
+        math::ConjFunctor<T> functor_dout(in_tensor_dout->data<T>(),
+                                          in_tensor_dout->numel(),
+                                          in_tensor_dout_help.data<T>());
+        for_range(functor_dout);
+        math::ConjFunctor<T> functor_y(in_tensor_y->data<T>(),
+                                       in_tensor_y->numel(),
+                                       in_tensor_y_help.data<T>());
+        for_range(functor_y);
+        auto dout = framework::EigenVector<T>::Flatten(in_tensor_dout_help);
+        auto y = framework::EigenVector<T>::Flatten(in_tensor_y_help);
+        auto d_ddout = framework::EigenVector<T>::Flatten(*in_tensor_d_ddout);
+        auto d_dy = framework::EigenVector<T>::Flatten(*in_tensor_d_dy);
+        auto d_ddx = framework::EigenVector<T>::Flatten(*out_tensor_d_ddx);
+        Eigen::DSizes<int, 1> size(in_tensor_y->numel());
+        d_ddx.device(dev) =
+            (dout.broadcast(size) * d_dy + y * d_ddout.broadcast(size));
+      }
+      if (out_tensor_d_ddy) {
+        framework::Tensor in_tensor_dout_help, in_tensor_x_help;
+        in_tensor_dout_help.Resize(in_tensor_dout->dims());
+        in_tensor_dout_help.mutable_data<T>(ctx.GetPlace());
+        in_tensor_x_help.Resize(in_tensor_x->dims());
+        in_tensor_x_help.mutable_data<T>(ctx.GetPlace());
+
+        auto& dev_raw = ctx.template device_context<DeviceContext>();
+        auto& dev = *dev_raw.eigen_device();
+        paddle::platform::ForRange<DeviceContext> for_range(
+            dev_raw, in_tensor_dout->numel());
+        math::ConjFunctor<T> functor_dout(in_tensor_dout->data<T>(),
+                                          in_tensor_dout->numel(),
+                                          in_tensor_dout_help.data<T>());
+        for_range(functor_dout);
+        math::ConjFunctor<T> functor_x(in_tensor_x->data<T>(),
+                                       in_tensor_x->numel(),
+                                       in_tensor_x_help.data<T>());
+        for_range(functor_x);
+        auto dout = framework::EigenVector<T>::Flatten(in_tensor_dout_help);
+        auto x = framework::EigenVector<T>::Flatten(in_tensor_x_help);
+        auto d_ddout = framework::EigenVector<T>::Flatten(*in_tensor_d_ddout);
+        auto d_dx = framework::EigenVector<T>::Flatten(*in_tensor_d_dx);
+        auto d_ddy = framework::EigenVector<T>::Flatten(*out_tensor_d_ddy);
+        Eigen::DSizes<int, 1> size(in_tensor_x->numel());
+        d_ddy.device(dev) =
+            (dout.broadcast(size) * d_dx + x * d_ddout.broadcast(size));
+      }
+    }
+#else
+    const auto* data_d_ddout = in_tensor_d_ddout->data<T>();
+
+    if (out_tensor_d_x) {
+      auto* data_d_x = out_tensor_d_x->mutable_data<T>(ctx.GetPlace());
+      const auto* data_ddy = in_tensor_ddy->data<T>();
+
+      const framework::DDim& dim = out_tensor_d_x->dims();
+      size_t N = static_cast<size_t>(framework::product(dim));
+      auto step = dim[dim.size() - 1];
+      int s = -1;
+
+      for (size_t i = 0; i < N; ++i) {
+        if (0 == i % step) ++s;
+        data_d_x[i] = T(data_ddy[i].real, -data_ddy[i].imag) * data_d_ddout[s];
+      }
+    }
+
+    if (out_tensor_d_y) {
+      auto* data_d_y = out_tensor_d_y->mutable_data<T>(ctx.GetPlace());
+      const auto* data_ddx = in_tensor_ddx->data<T>();
+
+      const framework::DDim& dim = out_tensor_d_y->dims();
+      size_t N = static_cast<size_t>(framework::product(dim));
+      auto step = dim[dim.size() - 1];
+      int s = -1;
+
+      for (size_t i = 0; i < N; ++i) {
+        if (0 == i % step) ++s;
+        data_d_y[i] = T(data_ddx[i].real, -data_ddx[i].imag) * data_d_ddout[s];
+      }
+    }
+
+    if (out_tensor_d_dout) {
+      auto* data_d_dout = out_tensor_d_dout->mutable_data<T>(ctx.GetPlace());
+      auto* data_ddx = in_tensor_ddx->data<T>();
+      auto* data_ddy = in_tensor_ddy->data<T>();
+      auto* data_d_dx = in_tensor_d_dx->data<T>();
+      auto* data_d_dy = in_tensor_d_dy->data<T>();
+
+      const framework::DDim& dim = out_tensor_d_dout->dims();
+      size_t N = static_cast<size_t>(framework::product(dim));
+      auto step = dim[dim.size() - 1];
+      int s = -1;
+      bool new_s = false;
+
+      for (size_t i = 0; i < N; ++i) {
+        if (0 == i % step) {
+          ++s;
+          new_s = true;
+        }
+        if (new_s) {
+          data_d_dout[s] =
+              T(data_ddy[i].real, -data_ddy[i].imag) * data_d_dx[i] +
+              T(data_ddx[i].real, -data_ddx[i].imag) * data_d_dy[i];
+        } else {
+          data_d_dout[s] +=
+              T(data_ddy[i].real, -data_ddy[i].imag) * data_d_dx[i] +
+              T(data_ddx[i].real, -data_ddx[i].imag) * data_d_dy[i];
+        }
+        new_s = false;
+      }
+    }
+
+    if (out_tensor_d_ddx) {
+      auto* data_d_ddx = out_tensor_d_ddx->mutable_data<T>(ctx.GetPlace());
+      auto* data_dout = in_tensor_dout->data<T>();
+      auto* data_d_dy = in_tensor_d_dy->data<T>();
+      auto* data_y = in_tensor_y->data<T>();
+      auto* data_d_ddout = in_tensor_d_ddout->data<T>();
+
+      const framework::DDim& dim = out_tensor_d_ddx->dims();
+      size_t N = static_cast<size_t>(framework::product(dim));
+      auto step = dim[dim.size() - 1];
+      int s = -1;
+
+      for (size_t i = 0; i < N; ++i) {
+        if (0 == i % step) ++s;
+        data_d_ddx[i] =
+            T(data_dout[s].real, -data_dout[s].imag) * data_d_dy[i] +
+            T(data_y[i].real, -data_y[i].imag) * data_d_ddout[s];
+      }
+    }
+
+    if (out_tensor_d_ddy) {
+      auto* data_d_ddy = out_tensor_d_ddy->mutable_data<T>(ctx.GetPlace());
+      auto* data_dout = in_tensor_dout->data<T>();
+      auto* data_d_dx = in_tensor_d_dx->data<T>();
+      auto* data_x = in_tensor_x->data<T>();
+      auto* data_d_ddout = in_tensor_d_ddout->data<T>();
+
+      const framework::DDim& dim = out_tensor_d_ddy->dims();
+      size_t N = static_cast<size_t>(framework::product(dim));
+      auto step = dim[dim.size() - 1];
+      int s = -1;
+
+      for (size_t i = 0; i < N; ++i) {
+        if (0 == i % step) ++s;
+        data_d_ddy[i] =
+            T(data_dout[s].real, -data_dout[s].imag) * data_d_dx[i] +
+            T(data_x[i].real, -data_x[i].imag) * data_d_ddout[s];
+      }
+    }
+#endif
+  }
+};
+
+template <typename DeviceContext, typename T>
+struct DotTripleGradFunction<DeviceContext, T, math::DisableComplex<T>> {
+  void operator()(const Tensor* in_tensor_x, const Tensor* in_tensor_y,
+                  const Tensor* in_tensor_ddx, const Tensor* in_tensor_ddy,
+                  const Tensor* in_tensor_d_dx, const Tensor* in_tensor_d_dy,
+                  const Tensor* in_tensor_dout, const Tensor* in_tensor_d_ddout,
+                  Tensor* out_tensor_d_x, Tensor* out_tensor_d_y,
+                  Tensor* out_tensor_d_dout, Tensor* out_tensor_d_ddx,
+                  Tensor* out_tensor_d_ddy,
+                  const paddle::framework::ExecutionContext& ctx) {
+#if defined(__NVCC__) || defined(__HIPCC__)
+    if (1 == in_tensor_d_ddout->dims().size()) {
+      auto& dev_raw = ctx.template device_context<DeviceContext>();
+      auto& dev = *dev_raw.eigen_device();
+      auto d_ddout = framework::EigenVector<T>::Flatten(*in_tensor_d_ddout);
+      if (out_tensor_d_x) {
+        out_tensor_d_x->mutable_data<T>(ctx.GetPlace());
+        auto ddy = framework::EigenVector<T>::Flatten(*in_tensor_ddy);
+        Eigen::DSizes<int, 1> size(in_tensor_ddy->numel());
+        auto d_x = framework::EigenVector<T>::Flatten(*out_tensor_d_x);
+        d_x.device(dev) = ddy * d_ddout.broadcast(size);
+      }
+
+      if (out_tensor_d_y) {
+        out_tensor_d_y->mutable_data<T>(ctx.GetPlace());
+        auto ddx = framework::EigenVector<T>::Flatten(*in_tensor_ddx);
+        Eigen::DSizes<int, 1> size(in_tensor_ddx->numel());
+
+        auto d_y = framework::EigenVector<T>::Flatten(*out_tensor_d_y);
+        d_y.device(dev) = ddx * d_ddout.broadcast(size);
+      }
+
+      if (out_tensor_d_dout) {
+        out_tensor_d_dout->mutable_data<T>(ctx.GetPlace());
+        auto ddx = framework::EigenVector<T>::Flatten(*in_tensor_ddx);
+        auto ddy = framework::EigenVector<T>::Flatten(*in_tensor_ddy);
+        auto d_dx = framework::EigenVector<T>::Flatten(*in_tensor_d_dx);
+        auto d_dy = framework::EigenVector<T>::Flatten(*in_tensor_d_dy);
+        auto d_dout = framework::EigenVector<T>::Flatten(*out_tensor_d_dout);
+        d_dout.device(dev) = (ddx * d_dy + ddy * d_dx).sum();
+      }
+
+      if (out_tensor_d_ddx) {
+        out_tensor_d_ddx->mutable_data<T>(ctx.GetPlace());
+        auto dout = framework::EigenVector<T>::Flatten(*in_tensor_dout);
+        auto y = framework::EigenVector<T>::Flatten(*in_tensor_y);
+        auto d_ddout = framework::EigenVector<T>::Flatten(*in_tensor_d_ddout);
+        auto d_dy = framework::EigenVector<T>::Flatten(*in_tensor_d_dy);
+        auto d_ddx = framework::EigenVector<T>::Flatten(*out_tensor_d_ddx);
+        Eigen::DSizes<int, 1> size(in_tensor_y->numel());
+        d_ddx.device(dev) =
+            (dout.broadcast(size) * d_dy + y * d_ddout.broadcast(size));
+      }
+
+      if (out_tensor_d_ddy) {
+        out_tensor_d_ddy->mutable_data<T>(ctx.GetPlace());
+        auto dout = framework::EigenVector<T>::Flatten(*in_tensor_dout);
+        auto x = framework::EigenVector<T>::Flatten(*in_tensor_x);
+        auto d_ddout = framework::EigenVector<T>::Flatten(*in_tensor_d_ddout);
+        auto d_dx = framework::EigenVector<T>::Flatten(*in_tensor_d_dx);
+        auto d_ddy = framework::EigenVector<T>::Flatten(*out_tensor_d_ddy);
+        Eigen::DSizes<int, 1> size(in_tensor_x->numel());
+        d_ddy.device(dev) =
+            (dout.broadcast(size) * d_dx + x * d_ddout.broadcast(size));
+      }
+    }
+#else
+    const auto* data_d_ddout = in_tensor_d_ddout->data<T>();
+
+    if (out_tensor_d_x) {
+      auto* data_d_x = out_tensor_d_x->mutable_data<T>(ctx.GetPlace());
+      const auto* data_ddy = in_tensor_ddy->data<T>();
+
+      const framework::DDim& dim = out_tensor_d_x->dims();
+      size_t N = static_cast<size_t>(framework::product(dim));
+      auto step = dim[dim.size() - 1];
+      int s = -1;
+
+      for (size_t i = 0; i < N; ++i) {
+        if (0 == i % step) ++s;
+        data_d_x[i] = data_ddy[i] * data_d_ddout[s];
+      }
+    }
+
+    if (out_tensor_d_y) {
+      auto* data_d_y = out_tensor_d_y->mutable_data<T>(ctx.GetPlace());
+      const auto* data_ddx = in_tensor_ddx->data<T>();
+
+      const framework::DDim& dim = out_tensor_d_y->dims();
+      size_t N = static_cast<size_t>(framework::product(dim));
+      auto step = dim[dim.size() - 1];
+      int s = -1;
+
+      for (size_t i = 0; i < N; ++i) {
+        if (0 == i % step) ++s;
+        data_d_y[i] = data_ddx[i] * data_d_ddout[s];
+      }
+    }
+
+    if (out_tensor_d_dout) {
+      auto* data_d_dout = out_tensor_d_dout->mutable_data<T>(ctx.GetPlace());
+      auto* data_ddx = in_tensor_ddx->data<T>();
+      auto* data_ddy = in_tensor_ddy->data<T>();
+      auto* data_d_dx = in_tensor_d_dx->data<T>();
+      auto* data_d_dy = in_tensor_d_dy->data<T>();
+
+      const framework::DDim& dim = in_tensor_ddx->dims();
+      size_t N = static_cast<size_t>(framework::product(dim));
+      auto step = dim[dim.size() - 1];
+      int s = -1;
+      bool new_s = false;
+      for (size_t i = 0; i < N; ++i) {
+        if (0 == i % step) {
+          ++s;
+          new_s = true;
+        }
+        if (new_s) {
+          data_d_dout[s] =
+              data_ddy[i] * data_d_dx[i] + data_ddx[i] * data_d_dy[i];
+        } else {
+          data_d_dout[s] +=
+              data_ddy[i] * data_d_dx[i] + data_ddx[i] * data_d_dy[i];
+        }
+        new_s = false;
+      }
+    }
+
+    if (out_tensor_d_ddx) {
+      auto* data_d_ddx = out_tensor_d_ddx->mutable_data<T>(ctx.GetPlace());
+      auto* data_dout = in_tensor_dout->data<T>();
+      auto* data_d_dy = in_tensor_d_dy->data<T>();
+      auto* data_y = in_tensor_y->data<T>();
+      auto* data_d_ddout = in_tensor_d_ddout->data<T>();
+
+      const framework::DDim& dim = out_tensor_d_ddx->dims();
+      size_t N = static_cast<size_t>(framework::product(dim));
+      auto step = dim[dim.size() - 1];
+      int s = -1;
+
+      for (size_t i = 0; i < N; ++i) {
+        if (0 == i % step) ++s;
+        data_d_ddx[i] =
+            data_dout[s] * data_d_dy[i] + data_y[i] * data_d_ddout[s];
+      }
+    }
+
+    if (out_tensor_d_ddy) {
+      auto* data_d_ddy = out_tensor_d_ddy->mutable_data<T>(ctx.GetPlace());
+      auto* data_dout = in_tensor_dout->data<T>();
+      auto* data_d_dx = in_tensor_d_dx->data<T>();
+      auto* data_x = in_tensor_x->data<T>();
+      auto* data_d_ddout = in_tensor_d_ddout->data<T>();
+
+      const framework::DDim& dim = out_tensor_d_ddy->dims();
+      size_t N = static_cast<size_t>(framework::product(dim));
+      auto step = dim[dim.size() - 1];
+      int s = -1;
+
+      for (size_t i = 0; i < N; ++i) {
+        if (0 == i % step) ++s;
+        data_d_ddy[i] =
+            data_dout[s] * data_d_dx[i] + data_x[i] * data_d_ddout[s];
+      }
+    }
+#endif
+  }
+};
+
 template <typename DeviceContext, typename T>
 class MatMulV2GradKernel : public framework::OpKernel<T> {
  public:
@@ -1322,7 +1744,7 @@ class MatMulV2DoubleGradKernel : public framework::OpKernel<T> {
       }
 
       if (ddout) {
-        // Caluate the gradient of OutputGrad(Out)
+        // Calculate the gradient of OutputGrad(Out)
         MatMulFunction<DeviceContext, T>(ddx, &y_conj, x_dims, y_dims, ddout,
                                          transpose_x, transpose_y, context);
         MatMulFunction<DeviceContext, T>(&x_conj, ddy, x_dims, y_dims, ddout,
@@ -1332,5 +1754,609 @@ class MatMulV2DoubleGradKernel : public framework::OpKernel<T> {
     }
   }
 };
+
+template <typename DeviceContext, typename T>
+class MatMulV2TripleGradKernel : public framework::OpKernel<T> {
+ public:
+  void MatMul(const framework::ExecutionContext& context,
+              const framework::Tensor& a, bool trans_a,
+              const framework::Tensor& b, bool trans_b, framework::Tensor* out,
+              bool flag) const {
+    out->mutable_data<T>(context.GetPlace());
+    auto blas = math::GetBlas<DeviceContext, T>(context);
+    auto mat_dim_a = math::CreateMatrixDescriptor(a.dims(), 0, trans_a);
+    auto mat_dim_b = math::CreateMatrixDescriptor(b.dims(), 0, trans_b);
+    if (a.dims().size() == 3 && b.dims().size() <= 2) {
+      // the transpose_X must be false, if is true, the transpose cost much time
+      if (!trans_a) {
+        mat_dim_a.height_ *= mat_dim_a.batch_size_;
+        mat_dim_a.batch_size_ = 0;
+      }
+    }
+    blas.MatMul(a, mat_dim_a, b, mat_dim_b, static_cast<T>(1), out,
+                static_cast<T>(flag));
+  }
+
+  void CalcInputGrad(const framework::ExecutionContext& context,
+                     const framework::Tensor& a, bool trans_a,
+                     bool is_fold_init_dims_a, const framework::Tensor& b,
+                     bool trans_b, bool is_fold_init_dims_b,
+                     framework::Tensor* out, bool flag) const {
+    if (out == nullptr) return;
+    bool need_combine = (a.dims().size() == 3 || b.dims().size() == 3) &&
+                        out->dims().size() == 2;
+    if (!need_combine) {
+      MatMul(context, a, trans_a, b, trans_b, out, flag);
+    } else {
+      auto& ctx = context.template device_context<DeviceContext>();
+      MatMul(context, is_fold_init_dims_a
+                          ? FoldInitDims(a)
+                          : FoldHeadAndLastDims<DeviceContext, T>(ctx, a),
+             trans_a, is_fold_init_dims_b
+                          ? FoldInitDims(b)
+                          : FoldHeadAndLastDims<DeviceContext, T>(ctx, b),
+             trans_b, out, flag);
+    }
+  }
+
+  void Compute(const framework::ExecutionContext& context) const override {
+    // get input
+    auto x = *context.Input<framework::Tensor>("X");
+    auto y = *context.Input<framework::Tensor>("Y");
+    auto dout = *context.Input<framework::Tensor>("DOut");
+    auto ddx = *context.Input<framework::Tensor>("DDX");
+    auto ddy = *context.Input<framework::Tensor>("DDY");
+
+    auto* d_dx = context.Input<framework::Tensor>("D_DX");
+    auto* d_dy = context.Input<framework::Tensor>("D_DY");
+    auto* d_ddout = context.Input<framework::Tensor>("D_DDOut");
+
+    // get output
+    auto* out_d_x = context.Output<framework::Tensor>("D_X_out");
+    auto* out_d_y = context.Output<framework::Tensor>("D_Y_out");
+    auto* out_d_dout = context.Output<framework::Tensor>("D_DOut_out");
+
+    auto* out_d_ddx = context.Output<framework::Tensor>("D_DDX_out");
+    auto* out_d_ddy = context.Output<framework::Tensor>("D_DDY_out");
+
+    bool transpose_x = context.Attr<bool>("trans_x");
+    bool transpose_y = context.Attr<bool>("trans_y");
+
+    // Get dims from the input x, y, output_grad
+    std::vector<std::int64_t> x_dims = vectorize(x.dims());
+    std::vector<std::int64_t> y_dims = vectorize(y.dims());
+    std::vector<std::int64_t> dout_dims = vectorize(dout.dims());
+    framework::Tensor x_conj(x.type());
+    framework::Tensor y_conj(y.type());
+    framework::Tensor dout_conj(dout.type());
+    framework::Tensor ddx_conj(ddx.type());
+    framework::Tensor ddy_conj(ddy.type());
+
+    int x_ndim = x_dims.size();
+    int y_ndim = y_dims.size();
+    int ndim = dout_dims.size();
+
+    // Case1 : x's and y's dim = 1
+    if (x_ndim == 1 && y_ndim == 1) {
+      VLOG(3) << "========  MatMulV2TripleGradKernel, Compute ====== Case 1";
+
+      DotTripleGradFunction<DeviceContext, T>()(
+          &x, &y, &ddx, &ddy, d_dx, d_dy, &dout, d_ddout, out_d_x, out_d_y,
+          out_d_dout, out_d_ddx, out_d_ddy, context);
+      return;
+    }
+
+    bool is_broadcast = true;
+    if (x_ndim <= 2 || y_ndim <= 2) {
+      is_broadcast = false;
+    } else if (x_ndim != y_ndim) {
+      is_broadcast = true;
+    } else {
+      is_broadcast = !std::equal(x_dims.cbegin(), x_dims.cbegin() + x_ndim - 2,
+                                 y_dims.cbegin());
+    }
+
+    if (!is_broadcast) {
+      // Case2: no broadcast or no batch size
+      VLOG(3) << "========  MatMulV2TripleGradKernel, Compute ====== Case 2";
+      ReshapeXYOutIntoMatrixSequence(&x, &y, &dout, transpose_x, transpose_y);
+
+      if (ddx.dims() != x.dims()) {
+        ddx.Resize(x.dims());
+      }
+
+      if (ddy.dims() != y.dims()) {
+        ddy.Resize(y.dims());
+      }
+
+      ConjHelper<DeviceContext, T> conj_helper(context);
+
+      framework::DDim out_dx_dims;
+      if (out_d_x) {
+        out_dx_dims = out_d_x->dims();
+        if (out_dx_dims != x.dims()) {
+          out_d_x->Resize(x.dims());
+        }
+      }
+
+      framework::DDim out_dy_dims;
+      if (out_d_y) {
+        out_dy_dims = out_d_y->dims();
+        if (out_dy_dims != y.dims()) {
+          out_d_y->Resize(y.dims());
+        }
+      }
+
+      framework::DDim out_d_dout_dims;
+      if (out_d_dout) {
+        out_d_dout_dims = out_d_dout->dims();
+        if (out_d_dout_dims != dout.dims()) {
+          out_d_dout->Resize(dout.dims());
+        }
+      }
+
+      framework::DDim out_d_ddx_dims;
+      if (out_d_ddx) {
+        out_d_ddx_dims = out_d_ddx->dims();
+        if (out_d_ddx_dims != x.dims()) {
+          out_d_ddx->Resize(x.dims());
+        }
+      }
+
+      framework::DDim out_d_ddy_dims;
+      if (out_d_ddy) {
+        out_d_ddy_dims = out_d_ddy->dims();
+        if (out_d_ddy_dims != y.dims()) {
+          out_d_ddy->Resize(y.dims());
+        }
+      }
+
+      if (out_d_dout) {
+        ConjHelper<DeviceContext, T> conj_helper(context);
+        conj_helper(ddx, ddx_conj);
+        conj_helper(ddy, ddy_conj);
+      }
+
+      if (out_d_ddx || out_d_ddy) {
+        ConjHelper<DeviceContext, T> conj_helper(context);
+        conj_helper(x, x_conj);
+        conj_helper(y, y_conj);
+        conj_helper(dout, dout_conj);
+      }
+
+      bool d_dout_flag = false;
+      bool d_ddx_flag = false;
+      bool d_ddy_flag = false;
+
+      if (d_ddout) {
+        auto d_ddout_mat = *d_ddout;
+        if (d_ddout_mat.dims() != dout.dims()) {
+          d_ddout_mat.Resize(dout.dims());
+        }
+
+        if (out_d_y) {
+          if (transpose_x && transpose_y) {
+            // out_d_y = d_ddout' * ddx'
+            CalcInputGrad(context, d_ddout_mat, true, true, ddx_conj, true,
+                          false, out_d_y, false);
+          } else if (transpose_x) {
+            // out_d_y = ddx * d_ddout
+            CalcInputGrad(context, ddx_conj, false, false, d_ddout_mat, false,
+                          true, out_d_y, false);
+          } else if (transpose_y) {
+            // out_d_y = d_ddout' * ddx
+            CalcInputGrad(context, d_ddout_mat, true, true, ddx_conj, false,
+                          true, out_d_y, false);
+          } else {
+            // out_d_y = ddx' * d_ddout
+            CalcInputGrad(context, ddx_conj, true, true, d_ddout_mat, false,
+                          true, out_d_y, false);
+          }
+        }
+
+        if (out_d_x) {
+          if (transpose_x && transpose_y) {
+            // out_d_x = ddy' * d_ddout'
+            CalcInputGrad(context, ddy_conj, true, true, d_ddout_mat, true,
+                          false, out_d_x, false);
+          } else if (transpose_x) {
+            // out_d_x = ddy * d_ddout'
+            CalcInputGrad(context, ddy_conj, false, false, d_ddout_mat, true,
+                          false, out_d_x, false);
+          } else if (transpose_y) {
+            // out_d_x = d_ddout * ddy
+            CalcInputGrad(context, d_ddout_mat, false, false, ddy_conj, false,
+                          true, out_d_x, false);
+          } else {
+            // out_d_x = d_ddout * ddy'
+            CalcInputGrad(context, d_ddout_mat, false, false, ddy_conj, true,
+                          false, out_d_x, false);
+          }
+        }
+
+        // equations:
+        // d_ddx = DOut * D_DY + Y * D_DDOut
+        // Let: d_ddx1 = Y * D_DDOut
+        // Let: d_ddx2 = DOut * D_DY
+
+        // d_ddy = DOut * D_DX + X * D_DDOut
+        // Let: d_ddy1 = X * D_DDOut
+        // Let: d_ddy2 = DOut * D_DX
+
+        // d_dout = DDY * D_DX + DDX * D_DY
+        // Let: d_dout1 = DDX * D_DY
+        // Let: d_dout2 = DDY * D_DX
+
+        // compute d_ddx1
+        if (out_d_ddx) {
+          if (transpose_x && transpose_y) {
+            // out_d_ddx1 = y' * d_ddout'
+            CalcInputGrad(context, y_conj, true, true, d_ddout_mat, true, false,
+                          out_d_ddx, d_ddx_flag);
+          } else if (transpose_x) {
+            // out_d_ddx1 = y * d_ddout'
+            CalcInputGrad(context, y_conj, false, false, d_ddout_mat, true,
+                          false, out_d_ddx, d_ddx_flag);
+          } else if (transpose_y) {
+            // out_d_ddx1 = d_ddout * y
+            CalcInputGrad(context, d_ddout_mat, false, false, y_conj, false,
+                          true, out_d_ddx, d_ddx_flag);
+          } else {
+            // out_d_ddx1 = d_ddout * y'
+            CalcInputGrad(context, d_ddout_mat, false, false, y_conj, true,
+                          false, out_d_ddx, d_ddx_flag);
+          }
+          d_ddx_flag = true;
+        }
+
+        // compute d_ddy1
+        if (out_d_ddy) {
+          if (transpose_x && transpose_y) {
+            // out_d_ddy1 = d_ddout' * x'
+            CalcInputGrad(context, d_ddout_mat, true, true, x_conj, true, false,
+                          out_d_ddy, false);
+          } else if (transpose_x) {
+            // out_d_ddy1 = x * d_ddout
+            CalcInputGrad(context, x_conj, false, false, d_ddout_mat, false,
+                          true, out_d_ddy, false);
+          } else if (transpose_y) {
+            // out_d_ddy1 = d_ddout' * x
+            CalcInputGrad(context, d_ddout_mat, true, true, x_conj, false, true,
+                          out_d_ddy, false);
+          } else {
+            // out_d_ddy1 = x' * d_ddout
+            CalcInputGrad(context, x_conj, true, true, d_ddout_mat, false, true,
+                          out_d_ddy, false);
+          }
+          d_ddy_flag = true;
+        }
+      }
+
+      if (d_dy) {
+        auto d_dy_mat = *d_dy;
+        if (d_dy_mat.dims() != y.dims()) {
+          d_dy_mat.Resize(y.dims());
+        }
+
+        // compute d_dout1
+        if (out_d_dout) {
+          CalcInputGrad(context, ddx_conj, transpose_x, true, d_dy_mat,
+                        transpose_y, false, out_d_dout, d_dout_flag);
+          d_dout_flag = true;
+        }
+
+        // compute d_ddx2
+        if (out_d_ddx) {
+          if (transpose_x && transpose_y) {
+            // out_d_ddx2 = D_DY' * DOut'
+            CalcInputGrad(context, d_dy_mat, true, true, dout_conj, true, false,
+                          out_d_ddx, d_ddx_flag);
+          } else if (transpose_x) {
+            // out_d_ddx2 = D_DY * Dout'
+            CalcInputGrad(context, d_dy_mat, false, false, dout_conj, true,
+                          false, out_d_ddx, d_ddx_flag);
+          } else if (transpose_y) {
+            // out_d_ddx2 = Dout * D_DY
+            CalcInputGrad(context, dout_conj, false, false, d_dy_mat, false,
+                          true, out_d_ddx, d_ddx_flag);
+          } else {
+            // out_d_ddx2 = Dout * D_DY'
+            CalcInputGrad(context, dout_conj, false, false, d_dy_mat, true,
+                          false, out_d_ddx, d_ddx_flag);
+          }
+        }
+      }
+
+      if (d_dx) {
+        auto d_dx_mat = *d_dx;
+        if (d_dx_mat.dims() != x.dims()) {
+          d_dx_mat.Resize(x.dims());
+        }
+
+        // compute d_dout2
+        if (out_d_dout) {
+          CalcInputGrad(context, d_dx_mat, transpose_x, true, ddy_conj,
+                        transpose_y, false, out_d_dout, d_dout_flag);
+        }
+
+        // compute d_ddy2
+        if (out_d_ddy) {
+          if (transpose_x && transpose_y) {
+            // out_d_ddy2 = dout' * d_dx'
+            CalcInputGrad(context, dout_conj, true, true, d_dx_mat, true, false,
+                          out_d_ddy, d_ddy_flag);
+          } else if (transpose_x) {
+            // out_d_ddy2 = d_dx * dout
+            CalcInputGrad(context, d_dx_mat, false, false, dout_conj, false,
+                          true, out_d_ddy, d_ddy_flag);
+          } else if (transpose_y) {
+            // out_d_ddy2 = dout' * d_dx
+            CalcInputGrad(context, dout_conj, true, true, d_dx_mat, false, true,
+                          out_d_ddy, d_ddy_flag);
+          } else {
+            // out_d_ddy2 = d_dx' * dout
+            CalcInputGrad(context, d_dx_mat, true, true, dout_conj, false, true,
+                          out_d_ddy, d_ddy_flag);
+          }
+        }
+      }
+
+      if (out_d_x) {
+        if (out_dx_dims != x.dims()) {
+          out_d_x->Resize(out_dx_dims);
+        }
+      }
+
+      if (out_d_y) {
+        if (out_dy_dims != y.dims()) {
+          out_d_y->Resize(out_dy_dims);
+        }
+      }
+
+      if (out_d_dout) {
+        if (out_d_dout_dims != dout.dims()) {
+          out_d_dout->Resize(out_d_dout_dims);
+        }
+      }
+
+      if (out_d_ddx) {
+        if (out_d_ddx_dims != x.dims()) {
+          out_d_ddx->Resize(out_d_ddx_dims);
+        }
+      }
+
+      if (out_d_ddy) {
+        if (out_d_ddy_dims != x.dims()) {
+          out_d_ddy->Resize(out_d_ddy_dims);
+        }
+      }
+
+    } else {
+      // Case3: broadcast. It need cost much time to reduce sum for the
+      // broadcast and wastes the memory.
+      // So we should avoid the case in reality.
+      VLOG(3) << "========  MatMulV2TripleGradKernel, Compute ====== Case 3";
+      VLOG(3) << "It need cost much time to reduce sum for the broadcast and "
+                 "wastes the memory. So we should avoid the case in reality";
+
+      Tensor out_dx_help, out_dy_help;
+      Tensor out_d_ddx_help, out_d_ddy_help;
+      if (out_d_dout) {
+        ConjHelper<DeviceContext, T> conj_helper(context);
+        conj_helper(ddx, ddx_conj);
+        conj_helper(ddy, ddy_conj);
+      }
+      if (out_d_ddx || out_d_ddy) {
+        ConjHelper<DeviceContext, T> conj_helper(context);
+        conj_helper(x, x_conj);
+        conj_helper(y, y_conj);
+        conj_helper(dout, dout_conj);
+      }
+
+      if (transpose_x) {
+        if (transpose_y) {
+          // dX = ddY' d_ddout’, dY = d_ddout’ ddX'
+          if (out_d_x)
+            MatMulFunction<DeviceContext, T>(&ddy_conj, d_ddout, y_dims,
+                                             dout_dims, &out_dx_help, true,
+                                             true, context);
+          if (out_d_y)
+            MatMulFunction<DeviceContext, T>(d_ddout, &ddx_conj, dout_dims,
+                                             x_dims, &out_dy_help, true, true,
+                                             context);
+        } else {
+          // dX = ddY d_ddout', dY = ddX d_ddout
+          if (out_d_x)
+            MatMulFunction<DeviceContext, T>(&ddy_conj, d_ddout, y_dims,
+                                             dout_dims, &out_dx_help, false,
+                                             true, context);
+          if (out_d_y)
+            MatMulFunction<DeviceContext, T>(&ddx_conj, d_ddout, x_dims,
+                                             dout_dims, &out_dy_help, false,
+                                             false, context);
+        }
+      } else {
+        if (transpose_y) {
+          // dX = d_ddout ddY, dY = d_ddout’ ddX
+          if (out_d_x)
+            MatMulFunction<DeviceContext, T>(d_ddout, &ddy_conj, dout_dims,
+                                             y_dims, &out_dx_help, false, false,
+                                             context);
+          if (out_d_y)
+            MatMulFunction<DeviceContext, T>(d_ddout, &ddx_conj, dout_dims,
+                                             x_dims, &out_dy_help, true, false,
+                                             context);
+        } else {
+          // dX = d_ddout ddY', dY = ddX' d_ddout
+          if (out_d_x)
+            MatMulFunction<DeviceContext, T>(d_ddout, &ddy_conj, dout_dims,
+                                             y_dims, &out_dx_help, false, true,
+                                             context);
+          if (out_d_y)
+            MatMulFunction<DeviceContext, T>(&ddx_conj, d_ddout, x_dims,
+                                             dout_dims, &out_dy_help, true,
+                                             false, context);
+        }
+      }
+
+      // get help dims
+      const std::vector<std::int64_t> dx_help_dims =
+          vectorize(out_dx_help.dims());
+      const std::vector<std::int64_t> dy_help_dims =
+          vectorize(out_dx_help.dims());
+
+      std::vector<std::int64_t> dx_broadcast_dims(ndim);
+      std::vector<std::int64_t> dy_broadcast_dims(ndim);
+
+      std::fill(dx_broadcast_dims.data(),
+                dx_broadcast_dims.data() + ndim - x_ndim, 1);
+      std::fill(dy_broadcast_dims.data(),
+                dy_broadcast_dims.data() + ndim - y_ndim, 1);
+      std::copy(x_dims.data(), x_dims.data() + x_ndim,
+                dx_broadcast_dims.data() + ndim - x_ndim);
+      std::copy(y_dims.data(), y_dims.data() + y_ndim,
+                dy_broadcast_dims.data() + ndim - y_ndim);
+
+      std::vector<int> dx_reduce_dims;
+      std::vector<int> dy_reduce_dims;
+      for (int idx = 0; idx <= ndim - 3; idx++) {
+        if (dx_help_dims[idx] != 1 && dx_broadcast_dims[idx] == 1) {
+          dx_reduce_dims.push_back(idx);
+        }
+        if (dy_help_dims[idx] != 1 && dy_broadcast_dims[idx] == 1) {
+          dy_reduce_dims.push_back(idx);
+        }
+      }
+      // Reduce sum to get grad by ReduceSum
+      if (out_d_x) {
+        if (dx_reduce_dims.empty()) {
+          *out_d_x = std::move(out_dx_help);
+        } else {
+          ReduceSumForMatmulGrad<DeviceContext, T>(&out_dx_help, out_d_x,
+                                                   dx_reduce_dims, context);
+        }
+        out_d_x->Resize(x.dims());
+      }
+
+      if (out_d_y) {
+        if (dy_reduce_dims.empty()) {
+          *out_d_y = std::move(out_dy_help);
+        } else {
+          ReduceSumForMatmulGrad<DeviceContext, T>(&out_dy_help, out_d_y,
+                                                   dy_reduce_dims, context);
+        }
+        out_d_y->Resize(y.dims());
+      }
+
+      // compute d_dout
+      if (out_d_dout) {
+        MatMulFunction<DeviceContext, T>(d_dx, &ddy_conj, x_dims, y_dims,
+                                         out_d_dout, transpose_x, transpose_y,
+                                         context);
+        MatMulFunction<DeviceContext, T>(&ddx_conj, d_dy, x_dims, y_dims,
+                                         out_d_dout, transpose_x, transpose_y,
+                                         context, true);
+      }
+
+      // compute d_ddx
+      if (out_d_ddx) {
+        if (transpose_x && transpose_y) {
+          // out_d_ddx1 = y' * d_ddout'
+          MatMulFunction<DeviceContext, T>(&y_conj, d_ddout, y_dims, dout_dims,
+                                           &out_d_ddx_help, true, true,
+                                           context);
+          // out_d_ddx2 = D_DY' * DOut'
+          MatMulFunction<DeviceContext, T>(d_dy, &dout_conj, y_dims, dout_dims,
+                                           &out_d_ddx_help, true, true, context,
+                                           true);
+        } else if (transpose_x) {
+          // out_d_ddx1 = y * d_ddout'
+          MatMulFunction<DeviceContext, T>(&y_conj, d_ddout, y_dims, dout_dims,
+                                           &out_d_ddx_help, false, true,
+                                           context);
+          // out_d_ddx2 = D_DY * Dout'
+          MatMulFunction<DeviceContext, T>(d_dy, &dout_conj, y_dims, dout_dims,
+                                           &out_d_ddx_help, false, true,
+                                           context, true);
+        } else if (transpose_y) {
+          // out_d_ddx1 = d_ddout * y
+          MatMulFunction<DeviceContext, T>(d_ddout, &y_conj, dout_dims, y_dims,
+                                           &out_d_ddx_help, false, false,
+                                           context);
+          // out_d_ddx2 = Dout * D_DY
+          MatMulFunction<DeviceContext, T>(&dout_conj, d_dy, dout_dims, y_dims,
+                                           &out_d_ddx_help, false, false,
+                                           context, true);
+        } else {
+          // out_d_ddx1 = d_ddout * y'
+          MatMulFunction<DeviceContext, T>(d_ddout, &y_conj, dout_dims, y_dims,
+                                           &out_d_ddx_help, false, true,
+                                           context);
+          // out_d_ddx2 = Dout * D_DY'
+          MatMulFunction<DeviceContext, T>(&dout_conj, d_dy, dout_dims, y_dims,
+                                           &out_d_ddx_help, false, true,
+                                           context, true);
+        }
+        if (dx_reduce_dims.empty()) {
+          *out_d_ddx = std::move(out_d_ddx_help);
+        } else {
+          ReduceSumForMatmulGrad<DeviceContext, T>(&out_d_ddx_help, out_d_ddx,
+                                                   dx_reduce_dims, context);
+        }
+        out_d_ddx->Resize(x.dims());
+      }
+
+      // compute d_ddy
+      if (out_d_ddy) {
+        if (transpose_x && transpose_y) {
+          // out_d_ddy1 = d_ddout' * x'
+          MatMulFunction<DeviceContext, T>(d_ddout, &x_conj, dout_dims, x_dims,
+                                           &out_d_ddy_help, true, true,
+                                           context);
+          // out_d_ddy2 = dout' * d_dx'
+          MatMulFunction<DeviceContext, T>(&dout_conj, d_dx, dout_dims, x_dims,
+                                           &out_d_ddy_help, true, true, context,
+                                           true);
+        } else if (transpose_x) {
+          // out_d_ddy1 = x * d_ddout
+          MatMulFunction<DeviceContext, T>(&x_conj, d_ddout, x_dims, dout_dims,
+                                           &out_d_ddy_help, false, false,
+                                           context);
+          // out_d_ddy2 = d_dx * dout
+          MatMulFunction<DeviceContext, T>(d_dx, &dout_conj, x_dims, dout_dims,
+                                           &out_d_ddy_help, false, false,
+                                           context, true);
+        } else if (transpose_y) {
+          // out_d_ddy1 = d_ddout' * x
+          MatMulFunction<DeviceContext, T>(d_ddout, &x_conj, dout_dims, x_dims,
+                                           &out_d_ddy_help, true, false,
+                                           context);
+          // out_d_ddy2 = dout' * d_dx
+          MatMulFunction<DeviceContext, T>(&dout_conj, d_dx, dout_dims, x_dims,
+                                           &out_d_ddy_help, true, false,
+                                           context, true);
+        } else {
+          // out_d_ddy1 = x' * d_ddout
+          MatMulFunction<DeviceContext, T>(&x_conj, d_ddout, x_dims, dout_dims,
+                                           &out_d_ddy_help, true, false,
+                                           context);
+          // out_d_ddy2 = d_dx' * dout
+          MatMulFunction<DeviceContext, T>(d_dx, &dout_conj, x_dims, dout_dims,
+                                           &out_d_ddy_help, true, false,
+                                           context, true);
+        }
+
+        if (dy_reduce_dims.empty()) {
+          *out_d_ddy = std::move(out_d_ddy_help);
+        } else {
+          ReduceSumForMatmulGrad<DeviceContext, T>(&out_d_ddy_help, out_d_ddy,
+                                                   dy_reduce_dims, context);
+        }
+        out_d_ddy->Resize(y.dims());
+      }
+    }
+  }
+};
+
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/fluid/operators/matmul_v2_op_xpu.cc b/paddle/fluid/operators/matmul_v2_op_xpu.cc
index ae1e9358f6811..908a23c4ecc63 100644
--- a/paddle/fluid/operators/matmul_v2_op_xpu.cc
+++ b/paddle/fluid/operators/matmul_v2_op_xpu.cc
@@ -18,6 +18,8 @@
 #include <string>
 #include <vector>
 
+#include "paddle/fluid/operators/xpu_api_wrapper.h"
+
 namespace paddle {
 namespace operators {
 
@@ -74,17 +76,21 @@ static void MatMulXPUFunction(const Tensor* x, const Tensor* y, Tensor* out,
   int n = mat_dim_b.width_;
   int k = mat_dim_a.width_;
   int batch_size = mat_dim_a.batch_size_;
+  int ldx = mat_dim_a.trans_ ? m : k;
+  int ldy = mat_dim_b.trans_ ? k : n;
+  int ldout = n;
   if (batch_size <= 1) {
     int r = 0;
-    r = xpu::fc<XPUType, XPUType, XPUType, FCT>(
+    r = xpu_fc_wrapper<XPUType, FCT>(
         dev_ctx.x_context(), reinterpret_cast<const XPUType*>(x->data<T>()),
         reinterpret_cast<const XPUType*>(y->data<T>()),
         reinterpret_cast<XPUType*>(data_c), m, n, k, mat_dim_a.trans_,
-        mat_dim_b.trans_, nullptr, nullptr, nullptr);
+        mat_dim_b.trans_, nullptr, nullptr, nullptr, ldx, ldy, ldout, 1.0, 0,
+        nullptr, xpu::Activation_t::LINEAR);
     PADDLE_ENFORCE_EQ(
         r, XPU_SUCCESS,
         platform::errors::External(
-            "XPU fc_fusion kernel return wrong value[%d %s] , m = %d, n = "
+            "XPU fc kernel return wrong value[%d %s] , m = %d, n = "
             "%d, "
             "k = %d, a_tr = %d, b_tr = %d",
             r, XPUAPIErrorMsg[r], m, n, k, mat_dim_a.trans_, mat_dim_b.trans_));
@@ -129,8 +135,10 @@ class MatMulV2XPUKernel : public framework::OpKernel<T> {
     if (std::is_same<paddle::platform::float16, T>::value) {
       MatMulXPUFunction<T, int16_t>(x, y, out, trans_x, trans_y, ctx);
     } else {
-      if (std::getenv("XPU_PADDLE_MAT_MUL_V2_FCINT32") != nullptr) {
+      if (std::getenv("XPU_PADDLE_FC_INT32") != nullptr) {
         MatMulXPUFunction<T, int32_t>(x, y, out, trans_x, trans_y, ctx);
+      } else if (std::getenv("XPU_PADDLE_FC_LOCAL_INT16") != nullptr) {
+        MatMulXPUFunction<T, float>(x, y, out, trans_x, trans_y, ctx);
       } else {
         MatMulXPUFunction<T, int16_t>(x, y, out, trans_x, trans_y, ctx);
       }
@@ -178,8 +186,10 @@ class MatMulV2XPUGradKernel : public framework::OpKernel<T> {
     if (std::is_same<paddle::platform::float16, T>::value) {
       MatMulXPUFunction<T, int16_t>(&a, &b, out, trans_a, trans_b, ctx);
     } else {
-      if (std::getenv("XPU_PADDLE_MAT_MUL_GRAD_V2_FCINT32") != nullptr) {
+      if (std::getenv("XPU_PADDLE_FC_INT32") != nullptr) {
         MatMulXPUFunction<T, int32_t>(&a, &b, out, trans_a, trans_b, ctx);
+      } else if (std::getenv("XPU_PADDLE_FC_LOCAL_INT16") != nullptr) {
+        MatMulXPUFunction<T, float>(&a, &b, out, trans_a, trans_b, ctx);
       } else {
         MatMulXPUFunction<T, int16_t>(&a, &b, out, trans_a, trans_b, ctx);
       }
diff --git a/paddle/fluid/operators/mean_op.cu b/paddle/fluid/operators/mean_op.cu
index 430036bc67de7..26c844392d4d7 100644
--- a/paddle/fluid/operators/mean_op.cu
+++ b/paddle/fluid/operators/mean_op.cu
@@ -25,17 +25,6 @@ namespace cub = hipcub;
 namespace paddle {
 namespace operators {
 
-template <typename T>
-struct DivideFunctor {
-  HOSTDEVICE explicit inline DivideFunctor(int n)
-      : n_inv(static_cast<T>(1.0 / n)) {}
-
-  HOSTDEVICE inline T operator()(const T& x) const { return x * n_inv; }
-
- private:
-  T n_inv;
-};
-
 template <typename T>
 __global__ void MeanRunKernel(const T* in_data, T* out_data, int N) {
   int idx = blockDim.x * blockIdx.x + threadIdx.x;
@@ -45,37 +34,6 @@ __global__ void MeanRunKernel(const T* in_data, T* out_data, int N) {
   }
 }
 
-template <typename DeviceContext, typename T>
-class MeanCUDAKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* input = context.Input<Tensor>("X");
-    auto* output = context.Output<Tensor>("Out");
-
-    output->mutable_data<T>(context.GetPlace());
-    auto size_prob = input->numel();
-    const T* in_data = input->data<T>();
-    T* out_data = output->mutable_data<T>(context.GetPlace());
-    auto stream = context.cuda_device_context().stream();
-
-    DivideFunctor<T> transformer(size_prob);
-    cub::TransformInputIterator<T, DivideFunctor<T>, const T*> trans_x(
-        in_data, transformer);
-    size_t temp_storage_bytes = 0;
-
-    auto err = cub::DeviceReduce::Sum(nullptr, temp_storage_bytes, trans_x,
-                                      out_data, size_prob, stream);
-    PADDLE_ENFORCE_CUDA_SUCCESS(err);
-    framework::Tensor tmp;
-    auto* temp_storage = tmp.mutable_data<uint8_t>(
-        framework::make_ddim({static_cast<int64_t>(temp_storage_bytes)}),
-        context.GetPlace());
-    err = cub::DeviceReduce::Sum(temp_storage, temp_storage_bytes, trans_x,
-                                 out_data, size_prob, stream);
-    PADDLE_ENFORCE_CUDA_SUCCESS(err);
-  }
-};
-
 template <typename DeviceContext, typename T>
 class MeanCUDAGradKernel : public framework::OpKernel<T> {
  public:
@@ -104,10 +62,11 @@ class MeanCUDAGradKernel : public framework::OpKernel<T> {
 
 namespace ops = paddle::operators;
 namespace plat = paddle::platform;
+
 REGISTER_OP_CUDA_KERNEL(
-    mean, ops::MeanCUDAKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::MeanCUDAKernel<paddle::platform::CUDADeviceContext, double>,
-    ops::MeanCUDAKernel<paddle::platform::CUDADeviceContext, plat::float16>);
+    mean, ops::MeanKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::MeanKernel<paddle::platform::CUDADeviceContext, double>,
+    ops::MeanKernel<paddle::platform::CUDADeviceContext, plat::float16>);
 REGISTER_OP_CUDA_KERNEL(
     mean_grad,
     ops::MeanCUDAGradKernel<paddle::platform::CUDADeviceContext, float>,
diff --git a/paddle/fluid/operators/mean_op.h b/paddle/fluid/operators/mean_op.h
index 4780150751bf6..2b398771c5c96 100644
--- a/paddle/fluid/operators/mean_op.h
+++ b/paddle/fluid/operators/mean_op.h
@@ -15,6 +15,12 @@ limitations under the License. */
 #pragma once
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/pten_utils.h"
+
+// only can include the headers in paddle/top/api dirs
+#include "paddle/pten/api/lib/utils/tensor_utils.h"
+#include "paddle/pten/include/core.h"
+#include "paddle/pten/include/math.h"
 
 namespace paddle {
 namespace operators {
@@ -27,21 +33,40 @@ template <typename T, int MajorType = Eigen::RowMajor,
           typename IndexType = Eigen::DenseIndex>
 using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
 
+/** [ Why still keep the original kernel implementation? ]
+ *
+ * Removal of the original kernel implementation and kernel registration needs
+ * to ensure that the new kernel mechanism adapts to multiple sets of execution
+ * mechanisms, including:
+ *
+ * 1. Executor and ParallelExecutor
+ * 2. Dygraph OpBase (Tracer and Engine)
+ * 3. New Executor
+ * 4. Predictor
+ * 5. NPU and XPU lack kernel and need to reuse CPU Kernel
+ *
+ * Removal of the original Kernel requires a more complete solution to ensure
+ * that it will not affect the current execution system.
+ * Currently, only the first two cases are adapted.
+ *
+ * The principle here is that the implementation in the kernel must reuse the
+ * corresponding functions in the Tensor Operation library and cannot maintain
+ * two copies of the code.
+ */
 template <typename DeviceContext, typename T>
 class MeanKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
-    auto* input = context.Input<Tensor>("X");
-    auto* output = context.Output<Tensor>("Out");
-
-    output->mutable_data<T>(context.GetPlace());
+    auto* x = context.Input<Tensor>("X");
+    auto* out = context.Output<Tensor>("Out");
+    auto& dev_ctx = context.device_context<DeviceContext>();
+    out->mutable_data<T>(x->place());
 
-    auto X = EigenVector<T>::Flatten(*input);
-    auto y = EigenScalar<T>::From(*output);
-    auto& place =
-        *context.template device_context<DeviceContext>().eigen_device();
+    auto pt_x = paddle::experimental::MakePtenDenseTensor(*x);
+    auto pt_out = paddle::experimental::MakePtenDenseTensor(*out);
 
-    y.device(place) = X.mean();
+    // call new kernel
+    pten::Mean<T>(dev_ctx, *pt_x.get(), pt_out.get());
   }
 };
 
diff --git a/paddle/fluid/operators/memcpy_d2h_op.cc b/paddle/fluid/operators/memcpy_d2h_op.cc
index 3158b0963a43a..1eb8d09c783b0 100644
--- a/paddle/fluid/operators/memcpy_d2h_op.cc
+++ b/paddle/fluid/operators/memcpy_d2h_op.cc
@@ -125,24 +125,33 @@ REGISTER_OPERATOR(
     paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
     paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
 
-REGISTER_OP_CPU_KERNEL_FUNCTOR(memcpy_d2h, float, ops::MemcpyD2HKernel, double,
-                               ops::MemcpyD2HKernel, int, ops::MemcpyD2HKernel,
-                               int64_t, ops::MemcpyD2HKernel, bool,
-                               ops::MemcpyD2HKernel, plat::float16,
-                               ops::MemcpyD2HKernel);
+REGISTER_OP_CPU_KERNEL_FUNCTOR(
+    memcpy_d2h, float, ops::MemcpyD2HKernel, double, ops::MemcpyD2HKernel,
+    int8_t, ops::MemcpyD2HKernel, uint8_t, ops::MemcpyD2HKernel, int,
+    ops::MemcpyD2HKernel, int64_t, ops::MemcpyD2HKernel, bool,
+    ops::MemcpyD2HKernel, paddle::platform::bfloat16, ops::MemcpyD2HKernel,
+    paddle::platform::complex<float>, ops::MemcpyD2HKernel,
+    paddle::platform::complex<double>, ops::MemcpyD2HKernel, plat::float16,
+    ops::MemcpyD2HKernel, int16_t, ops::MemcpyD2HKernel);
 
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-REGISTER_OP_CUDA_KERNEL_FUNCTOR(memcpy_d2h, float, ops::MemcpyD2HKernel, double,
-                                ops::MemcpyD2HKernel, int, ops::MemcpyD2HKernel,
-                                int64_t, ops::MemcpyD2HKernel, bool,
-                                ops::MemcpyD2HKernel, plat::float16,
-                                ops::MemcpyD2HKernel);
+REGISTER_OP_CUDA_KERNEL_FUNCTOR(
+    memcpy_d2h, float, ops::MemcpyD2HKernel, double, ops::MemcpyD2HKernel,
+    int8_t, ops::MemcpyD2HKernel, uint8_t, ops::MemcpyD2HKernel, int,
+    ops::MemcpyD2HKernel, int64_t, ops::MemcpyD2HKernel, bool,
+    ops::MemcpyD2HKernel, paddle::platform::bfloat16, ops::MemcpyD2HKernel,
+    paddle::platform::complex<float>, ops::MemcpyD2HKernel,
+    paddle::platform::complex<double>, ops::MemcpyD2HKernel, plat::float16,
+    ops::MemcpyD2HKernel, int16_t, ops::MemcpyD2HKernel);
 #endif
 
 #ifdef PADDLE_WITH_ASCEND_CL
-REGISTER_OP_NPU_KERNEL_FUNCTOR(memcpy_d2h, float, ops::MemcpyD2HKernel, double,
-                               ops::MemcpyD2HKernel, int, ops::MemcpyD2HKernel,
-                               int64_t, ops::MemcpyD2HKernel, bool,
-                               ops::MemcpyD2HKernel, plat::float16,
-                               ops::MemcpyD2HKernel);
+REGISTER_OP_NPU_KERNEL_FUNCTOR(
+    memcpy_d2h, float, ops::MemcpyD2HKernel, double, ops::MemcpyD2HKernel,
+    int8_t, ops::MemcpyD2HKernel, uint8_t, ops::MemcpyD2HKernel, int,
+    ops::MemcpyD2HKernel, int64_t, ops::MemcpyD2HKernel, bool,
+    ops::MemcpyD2HKernel, paddle::platform::bfloat16, ops::MemcpyD2HKernel,
+    paddle::platform::complex<float>, ops::MemcpyD2HKernel,
+    paddle::platform::complex<double>, ops::MemcpyD2HKernel, plat::float16,
+    ops::MemcpyD2HKernel, int16_t, ops::MemcpyD2HKernel);
 #endif
diff --git a/paddle/fluid/operators/memcpy_h2d_op.cc b/paddle/fluid/operators/memcpy_h2d_op.cc
index f100dc6f7a53e..0e27ec0dc75b7 100644
--- a/paddle/fluid/operators/memcpy_h2d_op.cc
+++ b/paddle/fluid/operators/memcpy_h2d_op.cc
@@ -125,24 +125,33 @@ REGISTER_OPERATOR(
     paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
     paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
 
-REGISTER_OP_CPU_KERNEL_FUNCTOR(memcpy_h2d, float, ops::MemcpyH2DKernel, double,
-                               ops::MemcpyH2DKernel, int, ops::MemcpyH2DKernel,
-                               int64_t, ops::MemcpyH2DKernel, bool,
-                               ops::MemcpyH2DKernel, plat::float16,
-                               ops::MemcpyH2DKernel);
+REGISTER_OP_CPU_KERNEL_FUNCTOR(
+    memcpy_h2d, float, ops::MemcpyH2DKernel, double, ops::MemcpyH2DKernel,
+    int8_t, ops::MemcpyH2DKernel, uint8_t, ops::MemcpyH2DKernel, int,
+    ops::MemcpyH2DKernel, int64_t, ops::MemcpyH2DKernel, bool,
+    ops::MemcpyH2DKernel, paddle::platform::bfloat16, ops::MemcpyH2DKernel,
+    paddle::platform::complex<float>, ops::MemcpyH2DKernel,
+    paddle::platform::complex<double>, ops::MemcpyH2DKernel, plat::float16,
+    ops::MemcpyH2DKernel, int16_t, ops::MemcpyH2DKernel);
 
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-REGISTER_OP_CUDA_KERNEL_FUNCTOR(memcpy_h2d, float, ops::MemcpyH2DKernel, double,
-                                ops::MemcpyH2DKernel, int, ops::MemcpyH2DKernel,
-                                int64_t, ops::MemcpyH2DKernel, bool,
-                                ops::MemcpyH2DKernel, plat::float16,
-                                ops::MemcpyH2DKernel);
+REGISTER_OP_CUDA_KERNEL_FUNCTOR(
+    memcpy_h2d, float, ops::MemcpyH2DKernel, double, ops::MemcpyH2DKernel,
+    int8_t, ops::MemcpyH2DKernel, uint8_t, ops::MemcpyH2DKernel, int,
+    ops::MemcpyH2DKernel, int64_t, ops::MemcpyH2DKernel, bool,
+    ops::MemcpyH2DKernel, paddle::platform::bfloat16, ops::MemcpyH2DKernel,
+    paddle::platform::complex<float>, ops::MemcpyH2DKernel,
+    paddle::platform::complex<double>, ops::MemcpyH2DKernel, plat::float16,
+    ops::MemcpyH2DKernel, int16_t, ops::MemcpyH2DKernel);
 #endif
 
 #ifdef PADDLE_WITH_ASCEND_CL
-REGISTER_OP_NPU_KERNEL_FUNCTOR(memcpy_h2d, float, ops::MemcpyH2DKernel, double,
-                               ops::MemcpyH2DKernel, int, ops::MemcpyH2DKernel,
-                               int64_t, ops::MemcpyH2DKernel, bool,
-                               ops::MemcpyH2DKernel, plat::float16,
-                               ops::MemcpyH2DKernel);
+REGISTER_OP_NPU_KERNEL_FUNCTOR(
+    memcpy_h2d, float, ops::MemcpyH2DKernel, double, ops::MemcpyH2DKernel,
+    int8_t, ops::MemcpyH2DKernel, uint8_t, ops::MemcpyH2DKernel, int,
+    ops::MemcpyH2DKernel, int64_t, ops::MemcpyH2DKernel, bool,
+    ops::MemcpyH2DKernel, paddle::platform::bfloat16, ops::MemcpyH2DKernel,
+    paddle::platform::complex<float>, ops::MemcpyH2DKernel,
+    paddle::platform::complex<double>, ops::MemcpyH2DKernel, plat::float16,
+    ops::MemcpyH2DKernel, int16_t, ops::MemcpyH2DKernel);
 #endif
diff --git a/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc
index cce835e6bc035..fa2428458e569 100644
--- a/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc
@@ -389,6 +389,49 @@ class ConvMKLDNNHandlerT
     }
   }
 
+  std::shared_ptr<std::tuple<float, std::vector<float>>> get_int8_bias_scales(
+      const framework::ExecutionContext& ctx) {
+    // Get scales int8 bias key
+    const std::string key_bs = this->key_ + "@bs";
+
+    // Scales for int8 bias are to be cached to avoid
+    // computing them each iteration
+    auto bias_scale_tuple =
+        std::static_pointer_cast<std::tuple<float, std::vector<float>>>(
+            this->dev_ctx_.GetBlob(key_bs));
+    if (bias_scale_tuple) return bias_scale_tuple;
+
+    const auto* filter = ctx.Input<Tensor>("Filter");
+    const auto& weights_tz = framework::vectorize(filter->dims());
+    const int groups = std::max(ctx.Attr<int>("groups"), 1);
+
+    const auto& scale_weights_data =
+        ctx.Attr<std::vector<float>>("Scale_weights");
+    const auto& scale_in_data = ctx.Attr<float>("Scale_in");
+
+    bool is_multi_channel = scale_weights_data.size() > 1;
+    int mask_reorder = is_multi_channel ? 1 << 0 : 1;
+
+    int count = 1;
+    if (is_multi_channel) {
+      count *= weights_tz[0];
+      if (groups > 1) {
+        count *= weights_tz[1];
+      }
+    }
+
+    bias_scale_tuple =
+        std::make_shared<std::tuple<float, std::vector<float>>>(std::make_tuple(
+            static_cast<float>(mask_reorder), std::vector<float>(count)));
+    for (int i = 0; i < count; i++) {
+      std::get<1>(*bias_scale_tuple)[i] = scale_in_data * scale_weights_data[i];
+    }
+
+    this->dev_ctx_.SetBlob(key_bs, bias_scale_tuple);
+
+    return bias_scale_tuple;
+  }
+
   std::tuple<float, std::vector<float>> get_int8_scales(
       const framework::ExecutionContext& ctx) const {
     const auto* filter = ctx.Input<Tensor>("Filter");
@@ -428,32 +471,6 @@ class ConvMKLDNNHandlerT
     return std::make_tuple(sum_scale, output_shift_scale);
   }
 
-  std::tuple<float, std::vector<float>> get_int8_bias_scales(
-      const framework::ExecutionContext& ctx) const {
-    const auto* filter = ctx.Input<Tensor>("Filter");
-    const auto& weights_tz = framework::vectorize(filter->dims());
-    const int groups = std::max(ctx.Attr<int>("groups"), 1);
-
-    const auto& scale_weights_data =
-        ctx.Attr<std::vector<float>>("Scale_weights");
-    const auto& scale_in_data = ctx.Attr<float>("Scale_in");
-
-    bool is_multi_channel = scale_weights_data.size() > 1;
-    int mask_reorder = is_multi_channel ? 1 << 0 : 1;
-    int count =
-        is_multi_channel
-            ? (groups > 1 ? (weights_tz)[1] * (weights_tz)[0] : (weights_tz)[0])
-            : 1;
-    std::vector<float> scale_bias_data(count);
-
-#pragma omp parallel for if (count > 50)
-    for (int i = 0; i < count; i++) {
-      scale_bias_data[i] = scale_in_data * scale_weights_data[i];
-    }
-
-    return std::make_tuple(mask_reorder, scale_bias_data);
-  }
-
   mkldnn::primitive_attr CreatePostOps(
       std::string fuse_activation, float fuse_alpha, float fuse_beta,
       bool fuse_residual_conn, const std::vector<float> output_shift_scale = {},
@@ -475,23 +492,25 @@ class ConvMKLDNNHandlerT
     }
     // Fusion with ReLU layer is executed through the PostOps feature. Create a
     // PostOps object and configure it to execute an eltwise relu operation.
+    constexpr float scale = 1.0f;
     if (fuse_activation == "relu" || fuse_activation == "leaky_relu") {
-      constexpr float scale = 1.0f;
       post_operations.append_eltwise(scale, mkldnn::algorithm::eltwise_relu,
                                      fuse_alpha, fuse_beta);
     } else if (fuse_activation == "relu6") {
-      constexpr float scale = 1.0f;
       post_operations.append_eltwise(scale,
                                      mkldnn::algorithm::eltwise_bounded_relu,
                                      fuse_alpha, fuse_beta);
     } else if (fuse_activation == "swish") {
-      constexpr float scale = 1.0f;
       post_operations.append_eltwise(scale, mkldnn::algorithm::eltwise_swish,
                                      fuse_alpha, fuse_beta);
     } else if (fuse_activation == "hard_swish") {
-      constexpr float scale = 1.0f;
       post_operations.append_eltwise(
           scale, mkldnn::algorithm::eltwise_hardswish, fuse_alpha, fuse_beta);
+    } else if (fuse_activation == "hard_sigmoid") {
+      post_operations.append_eltwise(scale, mkldnn::algorithm::eltwise_linear,
+                                     fuse_alpha, fuse_beta);
+      post_operations.append_eltwise(scale, mkldnn::algorithm::eltwise_clip,
+                                     0.0f, 1.0f);
     }
     conv_attr.set_post_ops(post_operations);
     return conv_attr;
@@ -563,7 +582,7 @@ class ConvMKLDNNHandlerT
       const auto target_mem_p = this->AcquireMemory(target_key_suffix);
       user_mem_p->set_data_handle(platform::to_void_cast<T>(in_mem_data));
       if (user_mem_p != target_mem_p) {
-        this->AcquireReorder(user_mem_p, target_mem_p, key_mem);
+        this->AcquireReorder(user_mem_p, target_mem_p);
       }
       return target_mem_p;
     }
@@ -641,7 +660,7 @@ class ConvMKLDNNHandlerT
         platform::GetMKLDNNFormat(this->fwd_pd_->dst_desc())) {
       auto residual_memory_p = this->AcquireResidualMemory(residual_param);
       dst_memory_p = this->template AcquireDstMemory<T_out>(output);
-      this->AcquireReorder(residual_memory_p, dst_memory_p, "@residual_dst");
+      this->AcquireReorder(residual_memory_p, dst_memory_p);
     } else {
       // Changing ShareDataWith to TensorCopy results in performance drop
       // on ResNet architectures
@@ -816,13 +835,11 @@ class ConvMKLDNNOpKernel : public framework::OpKernel<T> {
         {MKLDNN_ARG_DST, *dst_memory_p}};
 
     if (bias) {
-      float mask_reorder;
-      std::vector<float> scale_bias_data;
-      std::tie(mask_reorder, scale_bias_data) =
-          handler.get_int8_bias_scales(ctx);
+      auto p_scales_tuple = handler.get_int8_bias_scales(ctx);
 
       auto bias_memory_p = handler.AcquireBiasMemoryWithReorder(
-          bias, is_test, scale_bias_data, mask_reorder);
+          bias, is_test, std::get<1>(*p_scales_tuple),
+          std::get<0>(*p_scales_tuple));
       args.insert({MKLDNN_ARG_BIAS, *bias_memory_p});
     }
 
diff --git a/paddle/fluid/operators/mkldnn/conv_transpose_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/conv_transpose_mkldnn_op.cc
index 8d43e9f0dca44..b68c950aa9232 100644
--- a/paddle/fluid/operators/mkldnn/conv_transpose_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/conv_transpose_mkldnn_op.cc
@@ -40,151 +40,144 @@ inline mkldnn::memory::dims GetWeightsTz(const Tensor* filter,
 
 template <typename T, typename K, typename T_out>
 class ConvTransposeMKLDNNHandlerT
-    : public platform::MKLDNNHandlerT<T, mkldnn::deconvolution_forward> {
+    : public platform::MKLDNNHandlerNoCachingT<T,
+                                               mkldnn::deconvolution_forward> {
  public:
   ConvTransposeMKLDNNHandlerT(const framework::ExecutionContext& ctx,
-                              const platform::MKLDNNDeviceContext& dev_ctx,
                               const mkldnn::engine mkldnn_engine,
-                              platform::Place cpu_place, const Tensor* input,
-                              const Tensor* filter, const Tensor* bias,
-                              Tensor* output, const std::string& unique_name)
-      : platform::MKLDNNHandlerT<T, mkldnn::deconvolution_forward>(
-            dev_ctx, mkldnn_engine, cpu_place,
-            platform::CreateKey(dev_ctx, framework::vectorize(input->dims()),
-                                unique_name)) {
-    if (!this->isCached()) {
-      const bool is_test = ctx.Attr<bool>("is_test");
-      PADDLE_ENFORCE_EQ(is_test, true,
-                        platform::errors::InvalidArgument(
-                            "ConvTransposeMKLDNN works only for inference. "
-                            "The attribute \'is_test\' value should be set to "
-                            "True, but got is_test=False."));
-
-      PADDLE_ENFORCE_EQ(
-          input->layout(), DataLayout::kMKLDNN,
-          platform::errors::InvalidArgument(
-              "Got wrong layout = %d for Input tensor.", input->layout()));
-      PADDLE_ENFORCE_NE(input->format(), MKLDNNMemoryFormat::undef,
-                        platform::errors::InvalidArgument(
-                            "Got wrong format for Input tensor. The input "
-                            "format is undefined."));
+                              const Tensor* input, const Tensor* filter,
+                              const Tensor* bias, Tensor* output)
+      : platform::MKLDNNHandlerNoCachingT<T, mkldnn::deconvolution_forward>(
+            mkldnn_engine, ctx.GetPlace()),
+        is_test_(ctx.Attr<bool>("is_test")) {
+    PADDLE_ENFORCE_EQ(is_test_, true,
+                      platform::errors::InvalidArgument(
+                          "ConvTransposeMKLDNN works only for inference. "
+                          "The attribute \'is_test\' value should be set to "
+                          "True, but got is_test=False."));
+
+    PADDLE_ENFORCE_EQ(
+        input->layout(), DataLayout::kMKLDNN,
+        platform::errors::InvalidArgument(
+            "Got wrong layout = %d for Input tensor.", input->layout()));
+    PADDLE_ENFORCE_NE(input->format(), MKLDNNMemoryFormat::undef,
+                      platform::errors::InvalidArgument(
+                          "Got wrong format for Input tensor. The input "
+                          "format is undefined."));
+
+    PADDLE_ENFORCE_EQ(
+        filter->layout(), DataLayout::kMKLDNN,
+        platform::errors::InvalidArgument(
+            "The filter tensor's laytout should be %d, but got %d.",
+            DataLayout::kMKLDNN, filter->layout()));
+    PADDLE_ENFORCE_NE(filter->format(), MKLDNNMemoryFormat::undef,
+                      platform::errors::InvalidArgument(
+                          "Got wrong formats for Filter tensor."));
+
+    PADDLE_ENFORCE_EQ(
+        input->dims().size(), 4,
+        platform::errors::InvalidArgument("Input must be with 4 dimensions, "
+                                          "i.e. NCHW. but got dimension =%d",
+                                          input->dims().size()));
+    PADDLE_ENFORCE_EQ(
+        filter->dims().size(), 4,
+        platform::errors::InvalidArgument("Filter must be with 4 dimensions, "
+                                          "i.e. OIHW, but got dimension =%d",
+                                          filter->dims().size()));
 
+    if (bias) {
       PADDLE_ENFORCE_EQ(
-          filter->layout(), DataLayout::kMKLDNN,
+          bias->layout(), DataLayout::kMKLDNN,
           platform::errors::InvalidArgument(
-              "The filter tensor's laytout should be %d, but got %d.",
-              DataLayout::kMKLDNN, filter->layout()));
-      PADDLE_ENFORCE_NE(filter->format(), MKLDNNMemoryFormat::undef,
+              "The bias tensor's laytout should be %d, but got %d.",
+              DataLayout::kMKLDNN, bias->layout()));
+      PADDLE_ENFORCE_NE(bias->format(), MKLDNNMemoryFormat::undef,
                         platform::errors::InvalidArgument(
-                            "Got wrong formats for Filter tensor."));
+                            "Got wrong format for Bias tensor."));
 
       PADDLE_ENFORCE_EQ(
-          input->dims().size(), 4,
-          platform::errors::InvalidArgument("Input must be with 4 dimensions, "
-                                            "i.e. NCHW. but got dimension =%d",
-                                            input->dims().size()));
-      PADDLE_ENFORCE_EQ(
-          filter->dims().size(), 4,
-          platform::errors::InvalidArgument("Filter must be with 4 dimensions, "
-                                            "i.e. OIHW, but got dimension =%d",
-                                            filter->dims().size()));
-
-      if (bias) {
-        PADDLE_ENFORCE_EQ(
-            bias->layout(), DataLayout::kMKLDNN,
-            platform::errors::InvalidArgument(
-                "The bias tensor's laytout should be %d, but got %d.",
-                DataLayout::kMKLDNN, bias->layout()));
-        PADDLE_ENFORCE_NE(bias->format(), MKLDNNMemoryFormat::undef,
-                          platform::errors::InvalidArgument(
-                              "Got wrong format for Bias tensor."));
-
-        PADDLE_ENFORCE_EQ(bias->dims().size(), 1,
-                          platform::errors::InvalidArgument(
-                              "Bias must only have 1 dimension, "
-                              "i.e. X, but got dimension = %d .",
-                              bias->dims().size()));
-      }
-
-      std::vector<int> strides_temp = ctx.Attr<std::vector<int>>("strides");
-      mkldnn::memory::dims strides(begin(strides_temp), end(strides_temp));
-
-      std::vector<int> paddings_temp = ctx.Attr<std::vector<int>>("paddings");
-      mkldnn::memory::dims paddings(begin(paddings_temp), end(paddings_temp));
-
-      std::vector<int> dilations_temp = ctx.Attr<std::vector<int>>("dilations");
-      mkldnn::memory::dims dilations(begin(dilations_temp),
-                                     end(dilations_temp));
-
-      int groups = ctx.Attr<int>("groups");
-      std::string padding_algorithm =
-          ctx.Attr<std::string>("padding_algorithm");
+          bias->dims().size(), 1,
+          platform::errors::InvalidArgument("Bias must only have 1 dimension, "
+                                            "i.e. X, but got dimension = %d .",
+                                            bias->dims().size()));
+    }
 
-      PADDLE_ENFORCE_EQ(
-          strides.size(), 2,
-          platform::errors::Unimplemented(
-              "Now we only support 2d oneDNN convolution transpose op"));
-
-      const auto& input_dims = input->dims();
-      const auto data_dims =
-          framework::slice_ddim(input_dims, 2, input_dims.size());
-      const auto& filter_dims = filter->dims();
-      const auto filter_data_dims =
-          framework::slice_ddim(filter_dims, 2, filter_dims.size());
-
-      const auto ksize = framework::vectorize(filter_data_dims);
-
-      UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm,
-                               data_dims, strides, ksize);
-
-      std::transform(dilations.begin(), dilations.end(), dilations.begin(),
-                     [](int64_t i) { return i - 1; });
-
-      const auto src_tz = framework::vectorize(input->dims());
-      const auto weights_tz = GetWeightsTz(filter, groups);
-      const auto dst_tz = framework::vectorize(output->dims());
-      const auto mkldnn_paddings = platform::ToMkldnnPadding(paddings);
-
-      /* create memory descriptor for convolution without specified format
-       * ('any') which lets a primitive (convolution in this case) choose
-       * the memory format preferred for best performance
-       */
-      const auto chosen_memory_format = MKLDNNMemoryFormat::any;
-      const std::string fuse_activation =
-          ctx.Attr<std::string>("fuse_activation");
-      const float fuse_alpha = ctx.Attr<float>("fuse_alpha");
-      const float fuse_beta = ctx.Attr<float>("fuse_beta");
-
-      auto data_type = mkldnn::memory::data_type::f32;
-      if (ctx.Attr<std::string>("mkldnn_data_type") == "bfloat16" ||
-          std::is_same<T_out, platform::bfloat16>::value)
-        data_type = mkldnn::memory::data_type::bf16;
-
-      const auto src_md =
-          platform::MKLDNNMemDesc(src_tz, data_type, chosen_memory_format);
-      const auto weights_md =
-          platform::MKLDNNMemDesc(weights_tz, data_type, chosen_memory_format);
-      const auto dst_md = platform::MKLDNNMemDesc(
-          dst_tz, platform::MKLDNNGetDataType<T_out>(), chosen_memory_format);
-
-      const mkldnn::primitive_attr conv_trans_attr =
-          CreatePostOps(fuse_activation, fuse_alpha, fuse_beta);
-      auto fwd_prop_kind = is_test ? mkldnn::prop_kind::forward_inference
-                                   : mkldnn::prop_kind::forward_training;
-      if (bias) {
-        std::vector<int64_t> bias_tz = framework::vectorize(bias->dims());
-        const auto bias_md =
-            platform::MKLDNNMemDesc(bias_tz, data_type, MKLDNNMemoryFormat::x);
-        this->AcquireForwardPrimitiveDescriptor(
-            conv_trans_attr, fwd_prop_kind,
-            dnnl::algorithm::deconvolution_direct, src_md, weights_md, bias_md,
-            dst_md, strides, dilations, mkldnn_paddings[0], mkldnn_paddings[1]);
-      } else {
-        this->AcquireForwardPrimitiveDescriptor(
-            conv_trans_attr, fwd_prop_kind,
-            dnnl::algorithm::deconvolution_direct, src_md, weights_md, dst_md,
-            strides, dilations, mkldnn_paddings[0], mkldnn_paddings[1]);
-      }
+    std::vector<int> strides_temp = ctx.Attr<std::vector<int>>("strides");
+    mkldnn::memory::dims strides(begin(strides_temp), end(strides_temp));
+
+    std::vector<int> paddings_temp = ctx.Attr<std::vector<int>>("paddings");
+    mkldnn::memory::dims paddings(begin(paddings_temp), end(paddings_temp));
+
+    std::vector<int> dilations_temp = ctx.Attr<std::vector<int>>("dilations");
+    mkldnn::memory::dims dilations(begin(dilations_temp), end(dilations_temp));
+
+    int groups = ctx.Attr<int>("groups");
+    std::string padding_algorithm = ctx.Attr<std::string>("padding_algorithm");
+
+    PADDLE_ENFORCE_EQ(
+        strides.size(), 2,
+        platform::errors::Unimplemented(
+            "Now we only support 2d oneDNN convolution transpose op"));
+
+    const auto& input_dims = input->dims();
+    const auto data_dims =
+        framework::slice_ddim(input_dims, 2, input_dims.size());
+    const auto& filter_dims = filter->dims();
+    const auto filter_data_dims =
+        framework::slice_ddim(filter_dims, 2, filter_dims.size());
+
+    const auto ksize = framework::vectorize(filter_data_dims);
+
+    UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm,
+                             data_dims, strides, ksize);
+
+    std::transform(dilations.begin(), dilations.end(), dilations.begin(),
+                   [](int64_t i) { return i - 1; });
+
+    const auto src_tz = framework::vectorize(input->dims());
+    const auto weights_tz = GetWeightsTz(filter, groups);
+    const auto dst_tz = framework::vectorize(output->dims());
+    const auto mkldnn_paddings = platform::ToMkldnnPadding(paddings);
+
+    /* create memory descriptor for convolution without specified format
+     * ('any') which lets a primitive (convolution in this case) choose
+     * the memory format preferred for best performance
+     */
+    const auto chosen_memory_format = MKLDNNMemoryFormat::any;
+    const std::string fuse_activation =
+        ctx.Attr<std::string>("fuse_activation");
+    const float fuse_alpha = ctx.Attr<float>("fuse_alpha");
+    const float fuse_beta = ctx.Attr<float>("fuse_beta");
+
+    auto data_type = mkldnn::memory::data_type::f32;
+    if (ctx.Attr<std::string>("mkldnn_data_type") == "bfloat16" ||
+        std::is_same<T_out, platform::bfloat16>::value)
+      data_type = mkldnn::memory::data_type::bf16;
+
+    const auto src_md =
+        platform::MKLDNNMemDesc(src_tz, data_type, chosen_memory_format);
+    const auto weights_md =
+        platform::MKLDNNMemDesc(weights_tz, data_type, chosen_memory_format);
+    const auto dst_md = platform::MKLDNNMemDesc(
+        dst_tz, platform::MKLDNNGetDataType<T_out>(), chosen_memory_format);
+
+    const mkldnn::primitive_attr conv_trans_attr =
+        CreatePostOps(fuse_activation, fuse_alpha, fuse_beta);
+    auto fwd_prop_kind = is_test_ ? mkldnn::prop_kind::forward_inference
+                                  : mkldnn::prop_kind::forward_training;
+    if (bias) {
+      std::vector<int64_t> bias_tz = framework::vectorize(bias->dims());
+      const auto bias_md =
+          platform::MKLDNNMemDesc(bias_tz, data_type, MKLDNNMemoryFormat::x);
+      this->AcquireForwardPrimitiveDescriptor(
+          conv_trans_attr, fwd_prop_kind, dnnl::algorithm::deconvolution_direct,
+          src_md, weights_md, bias_md, dst_md, strides, dilations,
+          mkldnn_paddings[0], mkldnn_paddings[1]);
+    } else {
+      this->AcquireForwardPrimitiveDescriptor(
+          conv_trans_attr, fwd_prop_kind, dnnl::algorithm::deconvolution_direct,
+          src_md, weights_md, dst_md, strides, dilations, mkldnn_paddings[0],
+          mkldnn_paddings[1]);
     }
   }
 
@@ -217,86 +210,140 @@ class ConvTransposeMKLDNNHandlerT
   std::shared_ptr<mkldnn::memory> AcquireSrcMemoryWithReorder(
       const framework::Tensor* input) {
     const T* input_data = input->data<T>();
-    const std::string user_key_suffix{"@src_mem_p_user"};
-    auto user_src_mem_p = this->AcquireMemory(user_key_suffix);
-    if (!user_src_mem_p) {
-      auto user_src_md = platform::MKLDNNMemDesc(
-          framework::vectorize(input->dims()), platform::MKLDNNGetDataType<T>(),
-          input->format());
-      return this->AcquireMemoryWithReorder(
-          user_src_md, this->fwd_pd_->src_desc(),
-          platform::to_void_cast<T>(input_data), "@src_mem_p");
-    } else {
-      const std::string target_key_suffix{"@src_mem_p_target"};
-      const auto target_src_mem_p = this->AcquireMemory(target_key_suffix);
-      user_src_mem_p->set_data_handle(platform::to_void_cast<T>(input_data));
-      if (user_src_mem_p != target_src_mem_p) {
-        this->AcquireReorder(user_src_mem_p, target_src_mem_p, "@src_mem_p");
-      }
-      return target_src_mem_p;
-    }
+    auto user_src_md = platform::MKLDNNMemDesc(
+        framework::vectorize(input->dims()), platform::MKLDNNGetDataType<T>(),
+        input->format());
+    return platform::MKLDNNHandlerNoCachingT<T, mkldnn::deconvolution_forward>::
+        AcquireMemoryWithReorder(user_src_md, this->fwd_pd_->src_desc(),
+                                 platform::to_void_cast<T>(input_data));
   }
 
   std::shared_ptr<mkldnn::memory> AcquireWeightsMemoryWithReorder(
-      const framework::Tensor* filter, const int& groups, const bool& is_test) {
-    // This is workaround to make execution faster, delete
-    // if statement after including md inside Tensor
-    auto weights_mem_p = this->AcquireMemory("@weights_mem_p_target");
-    if (is_test && weights_mem_p) {
-      return weights_mem_p;
-    } else {
-      const K* filter_data = filter->data<K>();
-      auto weights_tz = GetWeightsTz(filter, groups);
-      int g = std::max(groups, 1);
-
-      auto user_src_md = platform::MKLDNNMemDesc(
-          weights_tz, platform::MKLDNNGetDataType<K>(),
-          (g == 1) ? filter->format() : MKLDNNMemoryFormat::goihw);
-
-      auto iohw_weights_tz = framework::vectorize(filter->dims());
-      // Custom Reorder from IOHW to OIHW
-      auto iohw2oihw_reorder =
-          [&iohw_weights_tz](const K* filter_data) -> std::shared_ptr<K> {
-        int o = iohw_weights_tz[1];
-        int c = iohw_weights_tz[0];
-        int h = iohw_weights_tz[2];
-        int w = iohw_weights_tz[3];
-        std::shared_ptr<K> reordered_filter_data(new K[o * c * h * w](),
-                                                 std::default_delete<K[]>());
-        for (int i = 0; i < c; ++i) {
-          for (int j = 0; j < o; ++j) {
-            int in_offset = j * h * w + i * o * h * w;
-            int out_offset = j * c * h * w + i * h * w;
-            std::memcpy(&(reordered_filter_data.get())[out_offset],
-                        &filter_data[in_offset], h * w * sizeof(K));
-          }
+      const platform::MKLDNNDeviceContext& dev_ctx, const std::string& key,
+      const framework::Tensor* filter, const int& groups) {
+    const K* filter_data = filter->data<K>();
+    auto weights_tz = GetWeightsTz(filter, groups);
+    int g = std::max(groups, 1);
+
+    auto user_src_md = platform::MKLDNNMemDesc(
+        weights_tz, platform::MKLDNNGetDataType<K>(),
+        (g == 1) ? filter->format() : MKLDNNMemoryFormat::goihw);
+
+    auto iohw_weights_tz = framework::vectorize(filter->dims());
+    // Custom Reorder from IOHW to OIHW
+    auto iohw2oihw_reorder =
+        [&iohw_weights_tz](const K* filter_data) -> std::shared_ptr<K> {
+      int o = iohw_weights_tz[1];
+      int c = iohw_weights_tz[0];
+      int h = iohw_weights_tz[2];
+      int w = iohw_weights_tz[3];
+      std::shared_ptr<K> reordered_filter_data(new K[o * c * h * w](),
+                                               std::default_delete<K[]>());
+      for (int i = 0; i < c; ++i) {
+        for (int j = 0; j < o; ++j) {
+          int in_offset = j * h * w + i * o * h * w;
+          int out_offset = j * c * h * w + i * h * w;
+          std::memcpy(&(reordered_filter_data.get())[out_offset],
+                      &filter_data[in_offset], h * w * sizeof(K));
         }
+      }
+
+      return reordered_filter_data;
+    };
 
-        return reordered_filter_data;
-      };
+    return this->template AcquireMemoryWithReorder<K>(
+        dev_ctx, user_src_md, this->fwd_pd_->weights_desc(),
+        platform::to_void_cast<K>(filter_data), key, "@weights_mem_p", is_test_,
+        iohw2oihw_reorder);
+  }
 
-      return this->template AcquireMemoryWithReorder<K>(
-          user_src_md, this->fwd_pd_->weights_desc(),
-          platform::to_void_cast<K>(filter_data), "@weights_mem_p", is_test,
-          iohw2oihw_reorder);
+  template <typename F = T>
+  std::shared_ptr<mkldnn::memory> AcquireMemoryWithReorder(
+      const platform::MKLDNNDeviceContext& dev_ctx,
+      const mkldnn::memory::desc& user_md,
+      const mkldnn::memory::desc& target_md, void* ptr, const std::string& key,
+      const std::string& suffix, bool is_persistent = false,
+      std::function<std::shared_ptr<F>(const F*)> custom_reorder_func = {},
+      const std::vector<float>& scale_data = {1.0f}, int mask = 0) {
+    const auto target_key = key + suffix + "_target";
+    const auto key_reorder_p = key + suffix + "reorder_p";
+    const auto user_key = key + suffix + "_user";
+
+    auto target_memory_p =
+        std::static_pointer_cast<dnnl::memory>(dev_ctx.GetBlob(target_key));
+
+    if (target_memory_p == nullptr) {
+      if (custom_reorder_func) {
+        auto reordered_data =
+            custom_reorder_func(reinterpret_cast<const F*>(ptr));
+        dev_ctx.SetBlob(key_reorder_p + "-custom_reorder", reordered_data);
+        ptr = reinterpret_cast<void*>(reordered_data.get());
+      }
+      auto user_memory_p =
+          std::make_shared<dnnl::memory>(user_md, this->engine_, ptr);
+      if (user_md != target_md) {
+        target_memory_p =
+            std::make_shared<mkldnn::memory>(target_md, this->engine_);
+        dnnl::reorder::primitive_desc reorder_pdesc;
+        if (platform::is_int8<T>()) {
+          dnnl::primitive_attr attr;
+          attr.set_output_scales(mask, scale_data);
+          reorder_pdesc = dnnl::reorder::primitive_desc(*user_memory_p,
+                                                        *target_memory_p, attr);
+        } else {
+          reorder_pdesc =
+              dnnl::reorder::primitive_desc(*user_memory_p, *target_memory_p);
+        }
+        auto reorder_p = std::make_shared<dnnl::reorder>(reorder_pdesc);
+        dev_ctx.SetBlob(key_reorder_p, reorder_p);
+
+        auto& astream = platform::MKLDNNDeviceContext::tls().get_stream();
+        platform::RecordEvent record_reorder("int_reorder",
+                                             platform::EventRole::kUniqueOp);
+        reorder_p->execute(astream, {{MKLDNN_ARG_FROM, *user_memory_p},
+                                     {MKLDNN_ARG_TO, *target_memory_p}});
+        astream.wait();
+      } else {
+        target_memory_p = user_memory_p;
+      }
+      dev_ctx.SetBlob(user_key, user_memory_p);
+      dev_ctx.SetBlob(target_key, target_memory_p);
+    } else if (!is_persistent) {
+      auto& astream = platform::MKLDNNDeviceContext::tls().get_stream();
+
+      auto user_memory_p =
+          std::static_pointer_cast<dnnl::memory>(dev_ctx.GetBlob(user_key));
+      user_memory_p->set_data_handle(ptr);
+
+      // TODO(jczaja): Here we detect if reorder is cached it means it is needed
+      // need to change this to get rid of keys
+      auto reorder_p = std::static_pointer_cast<mkldnn::reorder>(
+          dev_ctx.GetBlob(key_reorder_p));
+      if (reorder_p != nullptr) {
+        platform::RecordEvent record_reorder("int_reorder",
+                                             platform::EventRole::kUniqueOp);
+        reorder_p->execute(astream, {{MKLDNN_ARG_FROM, *user_memory_p},
+                                     {MKLDNN_ARG_TO, *target_memory_p}});
+        astream.wait();
+      }
     }
+    return target_memory_p;
   }
 
   std::shared_ptr<mkldnn::memory> AcquireBiasMemoryWithReorder(
-      const framework::Tensor* bias, const bool& is_test) {
-    auto bias_mem_p = this->AcquireMemory("@bias_mem_p_target");
-    if (is_test && bias_mem_p) {
-      return bias_mem_p;
-    } else {
-      const K* bias_data = bias->data<K>();
-      auto user_bias_md = platform::MKLDNNMemDesc(
-          framework::vectorize(bias->dims()), platform::MKLDNNGetDataType<K>(),
-          MKLDNNMemoryFormat::x);
-      return this->AcquireMemoryWithReorder(
-          user_bias_md, this->fwd_pd_->bias_desc(),
-          platform::to_void_cast<K>(bias_data), "@bias_mem_p", is_test);
-    }
+      const platform::MKLDNNDeviceContext& dev_ctx, const std::string& key,
+      const framework::Tensor* bias) {
+    const K* bias_data = bias->data<K>();
+    auto user_bias_md = platform::MKLDNNMemDesc(
+        framework::vectorize(bias->dims()), platform::MKLDNNGetDataType<K>(),
+        MKLDNNMemoryFormat::x);
+    return this->AcquireMemoryWithReorder(
+        dev_ctx, user_bias_md, this->fwd_pd_->bias_desc(),
+        platform::to_void_cast<K>(bias_data), key, "@bias_mem_p", is_test_);
   }
+
+ private:
+  const bool is_test_;
 };
 
 template <typename T, typename K>
@@ -325,22 +372,21 @@ class ConvTransposeMKLDNNOpKernel : public framework::OpKernel<T> {
         ctx.template device_context<platform::MKLDNNDeviceContext>();
     const auto& mkldnn_engine = dev_ctx.GetEngine();
 
-    const bool is_test = ctx.Attr<bool>("is_test");
-
     const auto* input = ctx.Input<Tensor>("Input");
     const auto* filter = ctx.Input<Tensor>("Filter");
     const auto* bias =
         ctx.HasInput("Bias") ? ctx.Input<Tensor>("Bias") : nullptr;
     auto* output = ctx.Output<Tensor>("Output");
-    const std::string unique_name = ctx.InputName("Input") +
-                                    ctx.InputName("Filter") +
-                                    (bias ? ctx.InputName("Bias") : "");
-    ConvTransposeMKLDNNHandlerT<T, K, T_out> handler(
-        ctx, dev_ctx, mkldnn_engine, ctx.GetPlace(), input, filter, bias,
-        output, unique_name);
+    ConvTransposeMKLDNNHandlerT<T, K, T_out> handler(ctx, mkldnn_engine, input,
+                                                     filter, bias, output);
     auto src_memory_p = handler.AcquireSrcMemoryWithReorder(input);
+    // Caching Key for weights is needed
+    std::string key = platform::CreateKey(dev_ctx, ctx.InputName("Input"),
+                                          ctx.InputName("Filter"),
+                                          (bias ? ctx.InputName("Bias") : ""));
+    key = platform::ExtendKeyWithThreadInfoIfNeeded(dev_ctx, key);
     auto weights_memory_p = handler.AcquireWeightsMemoryWithReorder(
-        filter, ctx.Attr<int>("groups"), is_test);
+        dev_ctx, key, filter, ctx.Attr<int>("groups"));
 
     std::shared_ptr<dnnl::memory> dst_memory_p =
         handler.template AcquireDstMemory<T_out>(output);
@@ -352,7 +398,8 @@ class ConvTransposeMKLDNNOpKernel : public framework::OpKernel<T> {
         {MKLDNN_ARG_DST, *dst_memory_p}};
 
     if (bias) {
-      auto bias_memory_p = handler.AcquireBiasMemoryWithReorder(bias, is_test);
+      auto bias_memory_p =
+          handler.AcquireBiasMemoryWithReorder(dev_ctx, key, bias);
       args.insert({MKLDNN_ARG_BIAS, *bias_memory_p});
     }
     auto& astream = platform::MKLDNNDeviceContext::tls().get_stream();
diff --git a/paddle/fluid/operators/mkldnn/layer_norm_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/layer_norm_mkldnn_op.cc
index e84266caa227c..8ab4612ff04b5 100644
--- a/paddle/fluid/operators/mkldnn/layer_norm_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/layer_norm_mkldnn_op.cc
@@ -43,16 +43,20 @@ class LayerNormMKLDNNHandler : public platform::MKLDNNHandlerNoCachingT<
     }
   }
 
-  std::shared_ptr<dnnl::memory> AcquireScaleShiftMemory(
-      std::vector<float>& scaleshift_data) {
-    // scaleshift_data comes from temporary buffer so we need to copy it into
-    // created memory primitivie
-    auto scaleshift_mem =
+  std::shared_ptr<dnnl::memory> AcquireScaleShiftMemory(const Tensor* scale,
+                                                        const Tensor* shift) {
+    // OneDNN requires a single piece of memory for scale and shift data
+    const unsigned int C = framework::vectorize(scale->dims())[0];
+
+    auto scaleshift_memory =
         this->AcquireMemoryFromPrimitive(this->fwd_pd_->weights_desc());
-    auto data_ptr = scaleshift_mem->get_data_handle();
-    std::size_t num_bytes = scaleshift_data.size() * sizeof(float);
-    std::memcpy(data_ptr, scaleshift_data.data(), num_bytes);
-    return scaleshift_mem;
+
+    auto mem_data_handle =
+        reinterpret_cast<float*>(scaleshift_memory->get_data_handle());
+    std::copy(scale->data<float>(), scale->data<float>() + C, mem_data_handle);
+    std::copy(shift->data<float>(), shift->data<float>() + C,
+              mem_data_handle + C);
+    return scaleshift_memory;
   }
 
   std::shared_ptr<dnnl::memory> AcquireMeanMemory(framework::Tensor* mean) {
@@ -95,7 +99,6 @@ class LayerNormMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
                           "axis:%d as begin_norm_axis.",
                           (src_tz.size() - 1)));
 
-    y->mutable_data<T>(ctx.GetPlace());
     const bool with_scaleshift = (scale && bias);
     dnnl::normalization_flags flags{};
 
@@ -113,16 +116,12 @@ class LayerNormMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
     auto layer_norm_p = handler.AcquireForwardPrimitive();
 
     auto& astream = platform::MKLDNNDeviceContext::tls().get_stream();
-    std::unordered_map<int, dnnl::memory> args;
-
-    args.insert({DNNL_ARG_SRC, *src_memory});
-    args.insert({DNNL_ARG_DST, *dst_memory});
+    std::unordered_map<int, dnnl::memory> args = {{DNNL_ARG_SRC, *src_memory},
+                                                  {DNNL_ARG_DST, *dst_memory}};
 
     if (!is_test) {
       auto* mean = ctx.Output<Tensor>("Mean");
       auto* var = ctx.Output<Tensor>("Variance");
-      mean->mutable_data<T>(ctx.GetPlace());
-      var->mutable_data<T>(ctx.GetPlace());
 
       auto mean_memory = handler.AcquireMeanMemory(mean);
       auto variance_memory = handler.AcquireVarianceMemory(var);
@@ -131,22 +130,9 @@ class LayerNormMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
       args.insert({DNNL_ARG_VARIANCE, *variance_memory});
     }
 
-    std::shared_ptr<mkldnn::memory> scaleshift_memory;
     if (with_scaleshift) {
-      auto scale_tz = paddle::framework::vectorize(scale->dims());
-      const unsigned int C = scale_tz[0];
-
-      // MKLDNN requires a single piece of memory for scale and shift/bias
-      // data
-      std::vector<float> scaleshift_data;
-      scaleshift_data.reserve(2 * C);
-      scaleshift_data.insert(scaleshift_data.begin(), scale->data<float>(),
-                             scale->data<float>() + C);
-
-      scaleshift_data.insert(scaleshift_data.end(), bias->data<float>(),
-                             bias->data<float>() + C);
-
-      scaleshift_memory = handler.AcquireScaleShiftMemory(scaleshift_data);
+      std::shared_ptr<mkldnn::memory> scaleshift_memory =
+          handler.AcquireScaleShiftMemory(scale, bias);
       args.insert({DNNL_ARG_SCALE_SHIFT, *scaleshift_memory});
     }
 
diff --git a/paddle/fluid/operators/mkldnn/matmul_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/matmul_mkldnn_op.cc
index b78acd32e6dc8..b7eb5a3ab4b57 100644
--- a/paddle/fluid/operators/mkldnn/matmul_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/matmul_mkldnn_op.cc
@@ -245,6 +245,36 @@ class MatMulMKLDNNHandler
     auto input_dims = ctx.Input<Tensor>(input_name)->dims();
     auto new_dims = input_dims;
     if (!shape.empty() && !axis.empty()) {
+      auto it_zero = std::find(shape.begin(), shape.end(), 0);
+      if (it_zero != shape.end()) {
+        for (uint64_t i = 0; i < shape.size(); i++) {
+          if (shape[i] == 0) {
+            PADDLE_ENFORCE_LT(
+                i, input_dims.size(),
+                paddle::platform::errors::InvalidArgument(
+                    "The index of 0 in fused_reshape_%s ",
+                    "should be less than output dim size, ",
+                    "but the index is %d and output dim size is %d", input_name,
+                    i, input_dims.size()));
+            shape[i] = input_dims.at(i);
+          }
+        }
+      }
+
+      // if "-1" is present then one of reshape dims must be infered
+      auto it_negative = std::find(shape.begin(), shape.end(), -1);
+      if (it_negative != shape.end()) {
+        int64_t dim_product = 1;
+        for (int i = 0; i < input_dims.size(); i++) {
+          dim_product *= input_dims.at(i);
+        }
+
+        int64_t shape_product = std::accumulate(shape.begin(), shape.end(), -1,
+                                                std::multiplies<int>());
+        int index = std::distance(shape.begin(), it_negative);
+        shape[index] = dim_product / shape_product;
+      }
+
       new_dims = input_dims.reshape(shape).transpose(axis);
     }
 
diff --git a/paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc
index 920ec97a769b6..9e437fb15e917 100644
--- a/paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc
@@ -30,234 +30,220 @@ using platform::to_void_cast;
 
 template <typename T>
 class PoolingMKLDNNHandler
-    : public platform::MKLDNNHandlerT<T, mkldnn::pooling_forward,
-                                      mkldnn::pooling_backward> {
+    : public platform::MKLDNNHandlerNoCachingT<T, mkldnn::pooling_forward,
+                                               mkldnn::pooling_backward> {
  public:
   PoolingMKLDNNHandler(const paddle::framework::ExecutionContext& ctx,
-                       const platform::MKLDNNDeviceContext& dev_ctx,
-                       platform::Place cpu_place, const Tensor* input,
-                       Tensor* output, const std::string& unique_name)
-      : platform::MKLDNNHandlerT<T, mkldnn::pooling_forward,
-                                 mkldnn::pooling_backward>(
-            dev_ctx, dev_ctx.GetEngine(), cpu_place,
-            platform::CreateKey(dev_ctx, framework::vectorize(input->dims()),
-                                framework::ToMKLDNNDataType(input->type()),
-                                unique_name)) {
-    if (!this->isCached()) {
-      PADDLE_ENFORCE_EQ(input->layout(), DataLayout::kMKLDNN,
-                        platform::errors::InvalidArgument(
-                            "Wrong layout set for Input tensor."));
-      PADDLE_ENFORCE_NE(input->format(), MKLDNNMemoryFormat::undef,
-                        platform::errors::InvalidArgument(
-                            "Wrong format set for Input tensor."));
-
-      const std::string pooling_type = ctx.Attr<std::string>("pooling_type");
-
-      std::vector<int> ksize_temp = ctx.Attr<std::vector<int>>("ksize");
-      std::vector<int64_t> ksize(begin(ksize_temp), end(ksize_temp));
-
-      std::vector<int> strides_temp = ctx.Attr<std::vector<int>>("strides");
-      std::vector<int64_t> strides(begin(strides_temp), end(strides_temp));
-
-      std::vector<int> paddings_temp = ctx.Attr<std::vector<int>>("paddings");
-      std::vector<int64_t> paddings(begin(paddings_temp), end(paddings_temp));
-
-      const bool global_pooling = ctx.Attr<bool>("global_pooling");
-      const std::string padding_algorithm =
-          ctx.Attr<std::string>("padding_algorithm");
-
-      // Only 2D pooling is supported now
-      PADDLE_ENFORCE_EQ(
-          ksize.size(), 2,
-          platform::errors::InvalidArgument(
-              "The ksize must be 2D, i.e. 2D pooling, but received %dD.",
-              ksize.size()));
-      PADDLE_ENFORCE_EQ(
-          pooling_type == "max" || pooling_type == "avg", true,
-          platform::errors::InvalidArgument(
-              "The pooling_type must be 'max' or 'avg', but received %s.",
-              pooling_type));
-      PADDLE_ENFORCE_EQ(
-          input->dims().size(), 4,
-          platform::errors::InvalidArgument(
-              "Input dim must be with 4, i.e. NCHW, but received %d.",
-              input->dims().size()));
-
-      const auto input_dims = input->dims();
-      framework::DDim data_dims =
-          framework::slice_ddim(input_dims, 2, input_dims.size());
-
-      if (global_pooling) {
-        operators::UpdateKsize(&ksize, data_dims);
-      }
-
-      operators::UpdatePadding(&paddings, global_pooling, 0, padding_algorithm,
-                               data_dims, strides, ksize);
+                       const mkldnn::engine mkldnn_engine, const Tensor* input,
+                       Tensor* output)
+      : platform::MKLDNNHandlerNoCachingT<T, mkldnn::pooling_forward,
+                                          mkldnn::pooling_backward>(
+            mkldnn_engine, ctx.GetPlace()) {
+    PADDLE_ENFORCE_EQ(input->layout(), DataLayout::kMKLDNN,
+                      platform::errors::InvalidArgument(
+                          "Wrong layout set for Input tensor."));
+    PADDLE_ENFORCE_NE(input->format(), MKLDNNMemoryFormat::undef,
+                      platform::errors::InvalidArgument(
+                          "Wrong format set for Input tensor."));
+
+    const std::string pooling_type = ctx.Attr<std::string>("pooling_type");
+
+    std::vector<int> ksize_temp = ctx.Attr<std::vector<int>>("ksize");
+    std::vector<int64_t> ksize(begin(ksize_temp), end(ksize_temp));
+
+    std::vector<int> strides_temp = ctx.Attr<std::vector<int>>("strides");
+    std::vector<int64_t> strides(begin(strides_temp), end(strides_temp));
+
+    std::vector<int> paddings_temp = ctx.Attr<std::vector<int>>("paddings");
+    std::vector<int64_t> paddings(begin(paddings_temp), end(paddings_temp));
+
+    const bool global_pooling = ctx.Attr<bool>("global_pooling");
+    const std::string padding_algorithm =
+        ctx.Attr<std::string>("padding_algorithm");
+
+    // Only 2D pooling is supported now
+    PADDLE_ENFORCE_EQ(
+        ksize.size(), 2,
+        platform::errors::InvalidArgument(
+            "The ksize must be 2D, i.e. 2D pooling, but received %dD.",
+            ksize.size()));
+    PADDLE_ENFORCE_EQ(
+        pooling_type == "max" || pooling_type == "avg", true,
+        platform::errors::InvalidArgument(
+            "The pooling_type must be 'max' or 'avg', but received %s.",
+            pooling_type));
+    PADDLE_ENFORCE_EQ(
+        input->dims().size(), 4,
+        platform::errors::InvalidArgument(
+            "Input dim must be with 4, i.e. NCHW, but received %d.",
+            input->dims().size()));
+
+    const auto input_dims = input->dims();
+    framework::DDim data_dims =
+        framework::slice_ddim(input_dims, 2, input_dims.size());
+
+    if (global_pooling) {
+      operators::UpdateKsize(&ksize, data_dims);
+    }
 
-      const auto src_tz = paddle::framework::vectorize(input->dims());
-      const auto dst_tz = paddle::framework::vectorize(output->dims());
+    operators::UpdatePadding(&paddings, global_pooling, 0, padding_algorithm,
+                             data_dims, strides, ksize);
 
-      const auto is_test = ctx.Attr<bool>("is_test");
+    const auto src_tz = paddle::framework::vectorize(input->dims());
+    const auto dst_tz = paddle::framework::vectorize(output->dims());
 
-      const auto dt = framework::ToMKLDNNDataType(input->type());
+    const auto is_test = ctx.Attr<bool>("is_test");
 
-      const auto exclude_padding = ctx.Attr<bool>("exclusive");
+    const auto dt = framework::ToMKLDNNDataType(input->type());
 
-      const auto src_md = mkldnn::memory::desc(src_tz, dt, input->format());
-      /* create memory descriptor for pooling without specified format
-       * ('any') which lets a primitive (pooling in this case) choose
-       * the memory format preferred for best performance
-       */
+    const auto exclude_padding = ctx.Attr<bool>("exclusive");
 
-      const auto dst_md =
-          platform::MKLDNNMemDesc(dst_tz, dt, MKLDNNMemoryFormat::any);
+    const auto src_md = mkldnn::memory::desc(src_tz, dt, input->format());
+    /* create memory descriptor for pooling without specified format
+     * ('any') which lets a primitive (pooling in this case) choose
+     * the memory format preferred for best performance
+     */
 
-      auto mkldnn_paddings = platform::ToMkldnnPadding(paddings);
+    const auto dst_md =
+        platform::MKLDNNMemDesc(dst_tz, dt, MKLDNNMemoryFormat::any);
 
-      const bool ceil_mode = ctx.Attr<bool>("ceil_mode");
+    auto mkldnn_paddings = platform::ToMkldnnPadding(paddings);
 
-      if (ceil_mode) {
-        CorrectOutputSize(src_tz, dst_tz, ksize, paddings, strides,
-                          mkldnn_paddings[1]);
-      }
+    const bool ceil_mode = ctx.Attr<bool>("ceil_mode");
 
-      ComputeAdaptivePoolParameters(ctx, src_tz, &ksize, &strides);
-
-      this->AcquireForwardPrimitiveDescriptor(
-          is_test ? mkldnn::prop_kind::forward_inference
-                  : mkldnn::prop_kind::forward_training,
-          pooling_type == "max"
-              ? mkldnn::algorithm::pooling_max
-              : (exclude_padding
-                     ? mkldnn::algorithm::pooling_avg_exclude_padding
-                     : mkldnn::algorithm::pooling_avg_include_padding),
-          src_md, dst_md, strides, ksize, mkldnn_paddings[0],
-          mkldnn_paddings[1]);
+    if (ceil_mode) {
+      CorrectOutputSize(src_tz, dst_tz, ksize, paddings, strides,
+                        mkldnn_paddings[1]);
     }
+
+    ComputeAdaptivePoolParameters(ctx, src_tz, &ksize, &strides);
+
+    this->AcquireForwardPrimitiveDescriptor(
+        is_test ? mkldnn::prop_kind::forward_inference
+                : mkldnn::prop_kind::forward_training,
+        pooling_type == "max"
+            ? mkldnn::algorithm::pooling_max
+            : (exclude_padding
+                   ? mkldnn::algorithm::pooling_avg_exclude_padding
+                   : mkldnn::algorithm::pooling_avg_include_padding),
+        src_md, dst_md, strides, ksize, mkldnn_paddings[0], mkldnn_paddings[1]);
   }
 
   PoolingMKLDNNHandler(const paddle::framework::ExecutionContext& ctx,
-                       const platform::MKLDNNDeviceContext& dev_ctx,
-                       platform::Place cpu_place, const Tensor* in_x,
-                       const Tensor* out_grad, Tensor* in_x_grad,
-                       const std::string& unique_name)
-      : platform::MKLDNNHandlerT<T, mkldnn::pooling_forward,
-                                 mkldnn::pooling_backward>(
-            dev_ctx, dev_ctx.GetEngine(), cpu_place,
-            platform::CreateKey(dev_ctx, framework::vectorize(in_x->dims()),
-                                framework::ToMKLDNNDataType(in_x->type()),
-                                unique_name)) {
-    if (!this->isBwdCached()) {
-      PADDLE_ENFORCE_EQ(in_x->layout(), DataLayout::kMKLDNN,
-                        platform::errors::InvalidArgument(
-                            "Wrong layout set for Input tensor"));
-      PADDLE_ENFORCE_NE(in_x->format(), MKLDNNMemoryFormat::undef,
-                        platform::errors::InvalidArgument(
-                            "Wrong format set for Input tensor"));
-
-      PADDLE_ENFORCE_EQ(out_grad->layout(), DataLayout::kMKLDNN,
-                        platform::errors::InvalidArgument(
-                            "Wrong layout set for Input output_grad tensor"));
-      PADDLE_ENFORCE_NE(out_grad->format(), MKLDNNMemoryFormat::undef,
-                        platform::errors::InvalidArgument(
-                            "Wrong format set for Input output_grad tensor"));
-
-      PADDLE_ENFORCE_EQ(
-          ctx.Attr<bool>("is_test"), false,
-          platform::errors::InvalidArgument(
-              "is_test attribute should be set to False in training phase."));
-
-      std::string pooling_type = ctx.Attr<std::string>("pooling_type");
-
-      std::vector<int> ksize_temp = ctx.Attr<std::vector<int>>("ksize");
-      std::vector<int64_t> ksize(begin(ksize_temp), end(ksize_temp));
-
-      std::vector<int> strides_temp = ctx.Attr<std::vector<int>>("strides");
-      std::vector<int64_t> strides(begin(strides_temp), end(strides_temp));
-
-      std::vector<int> paddings_temp = ctx.Attr<std::vector<int>>("paddings");
-      std::vector<int64_t> paddings(begin(paddings_temp), end(paddings_temp));
-
-      bool global_pooling = ctx.Attr<bool>("global_pooling");
-      std::string padding_algorithm =
-          ctx.Attr<std::string>("padding_algorithm");
-
-      auto in_x_dims = in_x->dims();
-      framework::DDim data_dims =
-          framework::slice_ddim(in_x_dims, 2, in_x_dims.size());
-
-      if (global_pooling) {
-        operators::UpdateKsize(&ksize, data_dims);
-      }
+                       const mkldnn::engine mkldnn_engine, const Tensor* in_x,
+                       const Tensor* out_grad, Tensor* in_x_grad)
+
+      : platform::MKLDNNHandlerNoCachingT<T, mkldnn::pooling_forward,
+                                          mkldnn::pooling_backward>(
+            mkldnn_engine, ctx.GetPlace()) {
+    PADDLE_ENFORCE_EQ(
+        in_x->layout(), DataLayout::kMKLDNN,
+        platform::errors::InvalidArgument("Wrong layout set for Input tensor"));
+    PADDLE_ENFORCE_NE(
+        in_x->format(), MKLDNNMemoryFormat::undef,
+        platform::errors::InvalidArgument("Wrong format set for Input tensor"));
+
+    PADDLE_ENFORCE_EQ(out_grad->layout(), DataLayout::kMKLDNN,
+                      platform::errors::InvalidArgument(
+                          "Wrong layout set for Input output_grad tensor"));
+    PADDLE_ENFORCE_NE(out_grad->format(), MKLDNNMemoryFormat::undef,
+                      platform::errors::InvalidArgument(
+                          "Wrong format set for Input output_grad tensor"));
+
+    PADDLE_ENFORCE_EQ(
+        ctx.Attr<bool>("is_test"), false,
+        platform::errors::InvalidArgument(
+            "is_test attribute should be set to False in training phase."));
+
+    std::string pooling_type = ctx.Attr<std::string>("pooling_type");
+
+    std::vector<int> ksize_temp = ctx.Attr<std::vector<int>>("ksize");
+    std::vector<int64_t> ksize(begin(ksize_temp), end(ksize_temp));
+
+    std::vector<int> strides_temp = ctx.Attr<std::vector<int>>("strides");
+    std::vector<int64_t> strides(begin(strides_temp), end(strides_temp));
+
+    std::vector<int> paddings_temp = ctx.Attr<std::vector<int>>("paddings");
+    std::vector<int64_t> paddings(begin(paddings_temp), end(paddings_temp));
+
+    bool global_pooling = ctx.Attr<bool>("global_pooling");
+    std::string padding_algorithm = ctx.Attr<std::string>("padding_algorithm");
+
+    auto in_x_dims = in_x->dims();
+    framework::DDim data_dims =
+        framework::slice_ddim(in_x_dims, 2, in_x_dims.size());
+
+    if (global_pooling) {
+      operators::UpdateKsize(&ksize, data_dims);
+    }
 
-      operators::UpdatePadding(&paddings, global_pooling, 0, padding_algorithm,
-                               data_dims, strides, ksize);
-
-      auto src_tz = paddle::framework::vectorize<int64_t>(in_x->dims());
-      auto diff_src_tz =
-          paddle::framework::vectorize<int64_t>(in_x_grad->dims());
-      auto diff_dst_tz =
-          paddle::framework::vectorize<int64_t>(out_grad->dims());
-
-      const auto dt = framework::ToMKLDNNDataType(in_x->type());
-      auto src_md = mkldnn::memory::desc(src_tz, dt, in_x->format());
-      auto dst_md =
-          mkldnn::memory::desc(diff_dst_tz, dt, MKLDNNMemoryFormat::any);
-      auto diff_dst_md = mkldnn::memory::desc(
-          diff_dst_tz, platform::MKLDNNGetDataType<T>(), out_grad->format());
-      auto diff_src_md =
-          mkldnn::memory::desc(diff_src_tz, platform::MKLDNNGetDataType<T>(),
-                               MKLDNNMemoryFormat::any);
-
-      auto mkldnn_paddings = platform::ToMkldnnPadding(paddings);
-      const bool ceil_mode = ctx.Attr<bool>("ceil_mode");
-
-      if (ceil_mode) {
-        CorrectOutputSize(src_tz, diff_dst_tz, ksize, paddings, strides,
-                          mkldnn_paddings[1]);
-      }
-      ComputeAdaptivePoolParameters(ctx, diff_src_tz, &ksize, &strides);
-
-      const auto exclude_padding = ctx.Attr<bool>("exclusive");
-
-      this->AcquireForwardPrimitiveDescriptor(
-          mkldnn::prop_kind::forward_training,
-          pooling_type == "max"
-              ? mkldnn::algorithm::pooling_max
-              : (exclude_padding
-                     ? mkldnn::algorithm::pooling_avg_exclude_padding
-                     : mkldnn::algorithm::pooling_avg_include_padding),
-          src_md, dst_md, strides, ksize, mkldnn_paddings[0],
-          mkldnn_paddings[1]);
-
-      this->AcquireBackwardPrimitiveDescriptor(
-          pooling_type == "max"
-              ? mkldnn::algorithm::pooling_max
-              : (exclude_padding
-                     ? mkldnn::algorithm::pooling_avg_exclude_padding
-                     : mkldnn::algorithm::pooling_avg_include_padding),
-          diff_src_md, diff_dst_md, strides, ksize, mkldnn_paddings[0],
-          mkldnn_paddings[1]);
+    operators::UpdatePadding(&paddings, global_pooling, 0, padding_algorithm,
+                             data_dims, strides, ksize);
+
+    auto src_tz = paddle::framework::vectorize<int64_t>(in_x->dims());
+    auto diff_src_tz = paddle::framework::vectorize<int64_t>(in_x_grad->dims());
+    auto diff_dst_tz = paddle::framework::vectorize<int64_t>(out_grad->dims());
+
+    const auto dt = framework::ToMKLDNNDataType(in_x->type());
+    auto src_md = mkldnn::memory::desc(src_tz, dt, in_x->format());
+    auto dst_md =
+        mkldnn::memory::desc(diff_dst_tz, dt, MKLDNNMemoryFormat::any);
+    auto diff_dst_md = mkldnn::memory::desc(
+        diff_dst_tz, platform::MKLDNNGetDataType<T>(), out_grad->format());
+    auto diff_src_md = mkldnn::memory::desc(
+        diff_src_tz, platform::MKLDNNGetDataType<T>(), MKLDNNMemoryFormat::any);
+
+    auto mkldnn_paddings = platform::ToMkldnnPadding(paddings);
+    const bool ceil_mode = ctx.Attr<bool>("ceil_mode");
+
+    if (ceil_mode) {
+      CorrectOutputSize(src_tz, diff_dst_tz, ksize, paddings, strides,
+                        mkldnn_paddings[1]);
     }
+    ComputeAdaptivePoolParameters(ctx, diff_src_tz, &ksize, &strides);
+
+    const auto exclude_padding = ctx.Attr<bool>("exclusive");
+
+    this->AcquireForwardPrimitiveDescriptor(
+        mkldnn::prop_kind::forward_training,
+        pooling_type == "max"
+            ? mkldnn::algorithm::pooling_max
+            : (exclude_padding
+                   ? mkldnn::algorithm::pooling_avg_exclude_padding
+                   : mkldnn::algorithm::pooling_avg_include_padding),
+        src_md, dst_md, strides, ksize, mkldnn_paddings[0], mkldnn_paddings[1]);
+
+    this->AcquireBackwardPrimitiveDescriptor(
+        pooling_type == "max"
+            ? mkldnn::algorithm::pooling_max
+            : (exclude_padding
+                   ? mkldnn::algorithm::pooling_avg_exclude_padding
+                   : mkldnn::algorithm::pooling_avg_include_padding),
+        diff_src_md, diff_dst_md, strides, ksize, mkldnn_paddings[0],
+        mkldnn_paddings[1]);
   }
 
-  std::shared_ptr<mkldnn::memory> AcquireWorkspaceMemory(void) {
+  std::shared_ptr<mkldnn::memory> AcquireWorkspaceMemory(
+      const platform::MKLDNNDeviceContext& dev_ctx,
+      const std::string& unique_name) {
     mkldnn::memory::desc workspace_md = this->fwd_pd_->workspace_desc();
-    // Pooling PD has to be passed to Grad op that
+    // Pooling Workspace has to be passed to Grad op that
     // may be executed by diffrent thread, hence
     // for that one we use key that does not contain TID
-    auto local_key = this->key_common_ + "@workspace";
+    std::string workspace_key =
+        platform::CreateKey(dev_ctx, workspace_md.dims(),
+                            workspace_md.data_type(), unique_name, "@wrk");
     auto mem_p = std::static_pointer_cast<mkldnn::memory>(
-        this->dev_ctx_.GetBlob(local_key));
+        dev_ctx.GetBlob(workspace_key));
     if (mem_p == nullptr) {
       static std::mutex acquire_barrier;
       std::lock_guard<std::mutex> block_threads_until_finish_this_job(
           acquire_barrier);
       mem_p = std::static_pointer_cast<mkldnn::memory>(
-          this->dev_ctx_.GetBlob(local_key));
+          dev_ctx.GetBlob(workspace_key));
       if (mem_p == nullptr) {
         mem_p = std::make_shared<mkldnn::memory>(workspace_md, this->engine_);
-        this->dev_ctx_.SetBlob(local_key, mem_p);
+        dev_ctx.SetBlob(workspace_key, mem_p);
       }
     }
     return mem_p;
@@ -319,8 +305,7 @@ class PoolMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
     const Tensor* input = ctx.Input<Tensor>("X");
     Tensor* output = ctx.Output<Tensor>("Out");
 
-    PoolingMKLDNNHandler<T> handler(ctx, dev_ctx, ctx.GetPlace(), input, output,
-                                    ctx.OutputName("Out"));
+    PoolingMKLDNNHandler<T> handler(ctx, dev_ctx.GetEngine(), input, output);
 
     auto src_memory = handler.AcquireSrcMemory(input);
     auto dst_memory = handler.AcquireDstMemory(output);
@@ -331,7 +316,8 @@ class PoolMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
     if ((ctx.Attr<bool>("is_test") == false) &&
         (ctx.Attr<std::string>("pooling_type") == "max")) {
       // Training
-      auto workspace_memory = handler.AcquireWorkspaceMemory();
+      auto workspace_memory =
+          handler.AcquireWorkspaceMemory(dev_ctx, ctx.OutputName("Out"));
       pool_p->execute(astream, {{MKLDNN_ARG_SRC, *src_memory},
                                 {MKLDNN_ARG_DST, *dst_memory},
                                 {MKLDNN_ARG_WORKSPACE, *workspace_memory}});
@@ -361,8 +347,8 @@ class PoolMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
     auto& dev_ctx =
         ctx.template device_context<platform::MKLDNNDeviceContext>();
 
-    PoolingMKLDNNHandler<T> handler(ctx, dev_ctx, ctx.GetPlace(), in_x,
-                                    out_grad, in_x_grad, ctx.InputName("Out"));
+    PoolingMKLDNNHandler<T> handler(ctx, dev_ctx.GetEngine(), in_x, out_grad,
+                                    in_x_grad);
 
     auto diff_dst_memory = handler.AcquireDiffDstMemory(out_grad);
     auto diff_src_memory = handler.AcquireDiffSrcMemory(in_x_grad);
@@ -372,7 +358,8 @@ class PoolMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
     auto& astream = platform::MKLDNNDeviceContext::tls().get_stream();
     if (ctx.Attr<std::string>("pooling_type") == "max") {
       // Max - pooling needs Workspace
-      auto workspace_memory = handler.AcquireWorkspaceMemory();
+      auto workspace_memory =
+          handler.AcquireWorkspaceMemory(dev_ctx, ctx.InputName("Out"));
       pool_bwd_p->execute(astream, {{MKLDNN_ARG_DIFF_SRC, *diff_src_memory},
                                     {MKLDNN_ARG_DIFF_DST, *diff_dst_memory},
                                     {MKLDNN_ARG_WORKSPACE, *workspace_memory}});
diff --git a/paddle/fluid/operators/mkldnn/quantize_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/quantize_mkldnn_op.cc
index 819c0d15505ca..815af4eaaf1b3 100644
--- a/paddle/fluid/operators/mkldnn/quantize_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/quantize_mkldnn_op.cc
@@ -64,81 +64,46 @@ class QuantOpKernel : public framework::OpKernel<T> {
     bool is_negative_input = ctx.Attr<bool>("is_negative_input");
     bool bfloat16 = ctx.Attr<bool>("bfloat16");
 
-    std::string key =
-        platform::CreateKey(dev_ctx, src_tz, scale_data, scale_shift,
-                            is_negative_input, ctx.OutputName("Output"));
-    key = platform::ExtendKeyWithThreadInfoIfNeeded(dev_ctx, key);
-
-    const std::string key_prim = key + "@r";
-    const std::string key_src_mem = key + "@s";
-    const std::string key_dst_mem = key + "@d";
-
+    // TODO(jczaja): Refactor with Acquire API
     std::shared_ptr<mkldnn::memory> src_memory;
     std::shared_ptr<mkldnn::memory> dst_memory;
     std::shared_ptr<reorder> reorder_p;
-    reorder_p = std::static_pointer_cast<reorder>(dev_ctx.GetBlob(key_prim));
-
-    if (reorder_p == nullptr) {
-      std::string out_layout = ctx.Attr<std::string>("output_format");
-      MKLDNNMemoryFormat out_format =
-          platform::data_format_to_memory_format(out_layout);
-      mkldnn::primitive_attr attri;
-      int mask = 0;
-      attri.set_output_scales(mask, {scale_data});
-
-      if (with_shift) {
-        mkldnn::post_ops post_operations;
-        post_operations.append_sum();
-        attri.set_post_ops(post_operations);
-        uint8_t* output_data = output->mutable_data<uint8_t>(ctx.GetPlace());
-        // memset casts scale_shift to unsigned char (uint8_t) internally
-        std::memset(output_data, scale_shift, output->numel());
-      }
-
-      auto src_md = platform::MKLDNNMemDesc({src_tz}, memory::data_type::f32,
-                                            input->format());
-      src_memory = std::make_shared<mkldnn::memory>(
-          src_md, engine, to_void_cast<T>(input_data));
-
-      std::shared_ptr<mkldnn::memory::desc> dst_md;
-      if (bfloat16) {
-        platform::SetDstMemoryQuantized<paddle::platform::bfloat16>(
-            ctx, output, dst_tz, engine, dst_md, dst_memory, out_format);
-      } else if (is_negative_input && !with_shift) {
-        platform::SetDstMemoryQuantized<int8_t>(ctx, output, dst_tz, engine,
-                                                dst_md, dst_memory, out_format);
-      } else {
-        platform::SetDstMemoryQuantized<uint8_t>(
-            ctx, output, dst_tz, engine, dst_md, dst_memory, out_format);
-      }
-      auto reorder_pd = std::shared_ptr<reorder::primitive_desc>(
-          new reorder::primitive_desc(*src_memory, *dst_memory, attri));
-      reorder_p = std::shared_ptr<reorder>(new reorder(*reorder_pd));
-
-      dev_ctx.SetBlob(key_prim, reorder_p);
-      dev_ctx.SetBlob(key_src_mem, src_memory);
-      dev_ctx.SetBlob(key_dst_mem, dst_memory);
+
+    std::string out_layout = ctx.Attr<std::string>("output_format");
+    MKLDNNMemoryFormat out_format =
+        platform::data_format_to_memory_format(out_layout);
+    mkldnn::primitive_attr attri;
+    int mask = 0;
+    attri.set_output_scales(mask, {scale_data});
+
+    if (with_shift) {
+      mkldnn::post_ops post_operations;
+      post_operations.append_sum();
+      attri.set_post_ops(post_operations);
+      uint8_t* output_data = output->mutable_data<uint8_t>(ctx.GetPlace());
+      // memset casts scale_shift to unsigned char (uint8_t) internally
+      std::memset(output_data, scale_shift, output->numel());
+    }
+
+    auto src_md = platform::MKLDNNMemDesc({src_tz}, memory::data_type::f32,
+                                          input->format());
+    src_memory = std::make_shared<mkldnn::memory>(src_md, engine,
+                                                  to_void_cast<T>(input_data));
+
+    std::shared_ptr<mkldnn::memory::desc> dst_md;
+    if (bfloat16) {
+      platform::SetDstMemoryQuantized<paddle::platform::bfloat16>(
+          ctx, output, dst_tz, engine, dst_md, dst_memory, out_format);
+    } else if (is_negative_input && !with_shift) {
+      platform::SetDstMemoryQuantized<int8_t>(ctx, output, dst_tz, engine,
+                                              dst_md, dst_memory, out_format);
     } else {
-      src_memory = std::static_pointer_cast<mkldnn::memory>(
-          dev_ctx.GetBlob(key_src_mem));
-      src_memory->set_data_handle(to_void_cast<T>(input_data));
-
-      dst_memory = std::static_pointer_cast<mkldnn::memory>(
-          dev_ctx.GetBlob(key_dst_mem));
-      auto place = ctx.GetPlace();
-
-      if (bfloat16) {
-        dst_memory->set_data_handle(
-            output->mutable_data<paddle::platform::bfloat16>(place));
-      } else if (with_shift || !is_negative_input) {
-        uint8_t* output_data = output->mutable_data<uint8_t>(ctx.GetPlace());
-        if (with_shift) std::memset(output_data, scale_shift, output->numel());
-        dst_memory->set_data_handle(output_data);
-      } else {
-        dst_memory->set_data_handle(
-            output->mutable_data<int8_t>(ctx.GetPlace()));
-      }
+      platform::SetDstMemoryQuantized<uint8_t>(ctx, output, dst_tz, engine,
+                                               dst_md, dst_memory, out_format);
     }
+    auto reorder_pd = std::shared_ptr<reorder::primitive_desc>(
+        new reorder::primitive_desc(*src_memory, *dst_memory, attri));
+    reorder_p = std::shared_ptr<reorder>(new reorder(*reorder_pd));
 
     auto& astream = platform::MKLDNNDeviceContext::tls().get_stream();
     {
diff --git a/paddle/fluid/operators/mkldnn/stack_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/stack_mkldnn_op.cc
new file mode 100644
index 0000000000000..94cf3747581c1
--- /dev/null
+++ b/paddle/fluid/operators/mkldnn/stack_mkldnn_op.cc
@@ -0,0 +1,143 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/utils.h"
+#include "paddle/fluid/platform/mkldnn_reuse.h"
+namespace paddle {
+namespace operators {
+
+using framework::DataLayout;
+using framework::Tensor;
+using framework::LoDTensor;
+using mkldnn::memory;
+using mkldnn::primitive;
+using mkldnn::concat;
+using mkldnn::stream;
+using platform::to_void_cast;
+
+template <typename T>
+class StackMKLDNNHandler
+    : public platform::MKLDNNHandlerNoCachingT<T, dnnl::concat> {
+ public:
+  StackMKLDNNHandler(const framework::ExecutionContext& ctx,
+                     const mkldnn::engine mkldnn_engine,
+                     const std::vector<const Tensor*>& inputs, Tensor* output)
+      : platform::MKLDNNHandlerNoCachingT<T, dnnl::concat>(mkldnn_engine,
+                                                           ctx.GetPlace()) {
+    int stack_axis = ctx.Attr<int>("axis");
+
+    int ndims = inputs[0]->dims().size();
+
+    if (stack_axis < 0) {
+      stack_axis = ndims + 1 + stack_axis;  // +1 to match output's ndims
+    }
+
+    // in stack op all inputs must have same dims
+    auto input_dims = framework::vectorize<int64_t>(inputs[0]->dims());
+
+    memory::data_type dt = framework::ToMKLDNNDataType(inputs[0]->type());
+    std::vector<memory::desc> srcs_md;
+    memory::desc dst_md;
+    MKLDNNMemoryFormat dst_fmt;
+
+    srcs_md.reserve(inputs.size());
+
+    // if stack is not done on last(non existing) axis, then we can optimize
+    // concat primitive by not adding additional dimension, since it causes
+    // wrong output format deduction and suboptimal performance as a result
+    if (stack_axis != ndims) {
+      for (size_t i = 0; i < inputs.size(); ++i) {
+        srcs_md.emplace_back(memory::desc(input_dims, dt, inputs[i]->format()));
+      }
+
+      input_dims[stack_axis] *= inputs.size();
+      dst_md = memory::desc(input_dims, dt, MKLDNNMemoryFormat::any);
+    } else {
+      auto extended_input_dims = framework::vectorize<int64_t>(output->dims());
+      extended_input_dims[stack_axis] = 1;
+
+      for (size_t i = 0; i < inputs.size(); ++i) {
+        srcs_md.emplace_back(memory::desc(input_dims, dt, inputs[i]->format())
+                                 .reshape(extended_input_dims));
+      }
+
+      // concat primitive choses suboptimal format tag because it cannot
+      // distinguish between f.e. abcd and abdc if last dim is equal to 1 so
+      // enforcing is needed for better performance
+      dst_fmt = platform::GetPlainMKLDNNFormat(extended_input_dims.size());
+      dst_md = memory::desc(framework::vectorize(output->dims()), dt, dst_fmt);
+    }
+
+    this->AcquireForwardPrimitiveDescriptor(dst_md, stack_axis, srcs_md);
+  }
+
+  // concat oneDNN prim is not having .desc attribute so we cannot use default
+  // AcquireForwardPrimitiveDescriptor
+  void AcquireForwardPrimitiveDescriptor(
+      const memory::desc& dst_md, const int stack_axis,
+      const std::vector<memory::desc>& srcs_md) {
+    this->fwd_pd_.reset(new dnnl::concat::primitive_desc(
+        dst_md, stack_axis, srcs_md, this->engine_));
+  }
+
+  std::shared_ptr<mkldnn::memory> AcquireSrcMemory(const Tensor& input, int i) {
+    const T* input_data = input.data<T>();
+    return this->AcquireMemoryFromPrimitive(this->fwd_pd_->src_desc(i),
+                                            to_void_cast<T>(input_data));
+  }
+};
+
+template <typename T>
+class StackMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
+ public:
+  void Compute(const paddle::framework::ExecutionContext& ctx) const override {
+    auto& dev_ctx =
+        ctx.template device_context<platform::MKLDNNDeviceContext>();
+    const auto& mkldnn_engine = dev_ctx.GetEngine();
+
+    auto multi_input = ctx.MultiInput<Tensor>("X");
+
+    Tensor* output = ctx.Output<Tensor>("Y");
+
+    StackMKLDNNHandler<T> handler(ctx, mkldnn_engine, multi_input, output);
+
+    std::vector<std::shared_ptr<memory>> srcs;
+    srcs.reserve(multi_input.size());
+
+    auto dst_mem = handler.AcquireDstMemory(output);
+    auto concat_p = handler.AcquireForwardPrimitive();
+
+    auto& astream = platform::MKLDNNDeviceContext::tls().get_stream();
+    std::unordered_map<int, memory> args;
+    for (size_t i = 0; i < multi_input.size(); ++i) {
+      srcs.push_back(handler.AcquireSrcMemory(*(multi_input[i]), i));
+      args.insert({MKLDNN_ARG_MULTIPLE_SRC + i, *(srcs.at(i))});
+    }
+    args.insert({MKLDNN_ARG_DST, *dst_mem});
+
+    concat_p->execute(astream, args);
+    astream.wait();
+
+    output->set_layout(DataLayout::kMKLDNN);
+    output->set_format(platform::GetMKLDNNFormat(
+        dst_mem->get_desc().reshape(framework::vectorize(output->dims()))));
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+
+REGISTER_OP_KERNEL(stack, MKLDNN, ::paddle::platform::CPUPlace,
+                   ops::StackMKLDNNOpKernel<float>);
diff --git a/paddle/fluid/operators/multinomial_op.cu b/paddle/fluid/operators/multinomial_op.cu
index 2d97111709a0f..1e52cf36f69c8 100644
--- a/paddle/fluid/operators/multinomial_op.cu
+++ b/paddle/fluid/operators/multinomial_op.cu
@@ -33,18 +33,22 @@ namespace operators {
 
 template <typename T>
 __global__ void NormalizeProbability(T* norm_probs, const T* in_data,
-                                     T* sum_rows) {
+                                     T* sum_rows, int64_t num_distributions,
+                                     int64_t num_categories) {
   int id = threadIdx.x + blockIdx.x * blockDim.x +
            blockIdx.y * gridDim.x * blockDim.x;
-  PADDLE_ENFORCE(
-      in_data[id] >= 0.0,
-      "The input of multinomial distribution should be >= 0, but got %f.",
-      in_data[id]);
-  PADDLE_ENFORCE(sum_rows[blockIdx.y] > 0.0,
-                 "The sum of one multinomial distribution probability should "
-                 "be > 0, but got %f.",
-                 sum_rows[blockIdx.y]);
-  norm_probs[id] = in_data[id] / sum_rows[blockIdx.y];
+  if (id < num_distributions * num_categories) {
+    PADDLE_ENFORCE(
+        in_data[id] >= 0.0,
+        "The input of multinomial distribution should be >= 0, but got %f.",
+        in_data[id]);
+    int64_t row_id = id / num_categories;
+    PADDLE_ENFORCE(sum_rows[row_id] > 0.0,
+                   "The sum of one multinomial distribution probability should "
+                   "be > 0, but got %f.",
+                   sum_rows[row_id]);
+    norm_probs[id] = in_data[id] / sum_rows[row_id];
+  }
 }
 
 template <typename T>
@@ -52,12 +56,10 @@ __global__ void GetCumulativeProbs(T* norm_probs_data,
                                    int64_t num_distributions,
                                    int64_t num_categories,
                                    T* cumulative_probs) {
-  for (int id = blockIdx.x; id < num_distributions; id += gridDim.x) {
-    thrust::inclusive_scan(thrust::device,
-                           norm_probs_data + id * num_categories,
-                           norm_probs_data + (id + 1) * num_categories,
-                           cumulative_probs + id * num_categories);
-  }
+  int id = blockIdx.x;
+  thrust::inclusive_scan(thrust::device, norm_probs_data + id * num_categories,
+                         norm_probs_data + (id + 1) * num_categories,
+                         cumulative_probs + id * num_categories);
 }
 
 template <typename T>
@@ -108,23 +110,19 @@ __global__ void sampleMultinomialWithReplacement(
   // use binary search to get the selected category sample id.
   // let cumulative_probs[id-1] < rng_data < cumulative_probs[id].
 
-  int idx = threadIdx.x + blockIdx.x * blockDim.x +
-            blockIdx.y * gridDim.x * blockDim.x;
-
   // for every distribution
-  for (int dist = blockIdx.y; dist < num_distributions; dist += gridDim.y) {
-    // for every sample
-    for (int sample = blockIdx.x * blockDim.x + threadIdx.x;
-         sample < num_samples; sample += blockDim.x * gridDim.x) {
-      T rng_number = rng_data[sample + dist * num_samples];
-
-      // Find the bucket that a uniform random number lies in
-      int selected_category = binarySearchFunctor<T>(
-          cumulative_probs + dist * num_categories,
-          norm_probs_data + dist * num_categories, num_categories, rng_number);
-
-      out_data[sample + dist * num_samples] = selected_category;
-    }
+  int dist = blockIdx.y;
+  // for every sample
+  int sample = blockIdx.x * blockDim.x + threadIdx.x;
+  if (sample < num_samples) {
+    T rng_number = rng_data[sample + dist * num_samples];
+
+    // Find the bucket that a uniform random number lies in
+    int selected_category = binarySearchFunctor<T>(
+        cumulative_probs + dist * num_categories,
+        norm_probs_data + dist * num_categories, num_categories, rng_number);
+
+    out_data[sample + dist * num_samples] = selected_category;
   }
 }
 
@@ -215,10 +213,11 @@ class MultinomialOpKernel<platform::CUDADeviceContext, T>
 
     // number of threads in a block is min(num_categories, 512)
     dim3 block_norm(num_categories < 512 ? num_categories : 512);
-    dim3 grid_norm((num_categories - 1) / block_norm.x + 1, num_distributions);
+    dim3 grid_norm((num_distributions * num_categories - 1) / block_norm.x + 1);
     NormalizeProbability<
         T><<<grid_norm, block_norm, 0, ctx.cuda_device_context().stream()>>>(
-        norm_probs_data, in_data, sum_rows_data);
+        norm_probs_data, in_data, sum_rows_data, num_distributions,
+        num_categories);
 
     // Get cumulative probability of each distribution. It's the same function
     // of
diff --git a/paddle/fluid/operators/npu_op_runner.cc b/paddle/fluid/operators/npu_op_runner.cc
index 830e18cb8a14c..e104fc157d6f0 100644
--- a/paddle/fluid/operators/npu_op_runner.cc
+++ b/paddle/fluid/operators/npu_op_runner.cc
@@ -436,5 +436,67 @@ void NpuOpRunner::Run(aclrtStream stream) const {
   PADDLE_ENFORCE_NPU_SUCCESS(ret);
 }
 
+void NpuOpRunner::TypeAdapter(
+    const std::vector<Tensor> &inputs, const std::vector<Tensor> &outputs,
+    const NPUAttributeMap &attrs, const platform::NPUDeviceContext &dev_ctx,
+    std::function<void(const std::vector<Tensor> &, const std::vector<Tensor> &,
+                       const NPUAttributeMap &,
+                       const platform::NPUDeviceContext &)>
+        op_runner,
+    const std::vector<framework::proto::VarType::Type> &input_type,
+    const std::vector<framework::proto::VarType::Type> &output_type) {
+  PADDLE_ENFORCE_EQ(
+      inputs.size(), input_type.size(),
+      platform::errors::InvalidArgument(
+          "The number of inputs must be equal to input_type.size()."));
+  PADDLE_ENFORCE_EQ(
+      outputs.size(), output_type.size(),
+      platform::errors::InvalidArgument(
+          "The number of outputs must be equal to output_type.size()."));
+
+  std::vector<Tensor> tmp_inputs(inputs.size());
+  std::vector<Tensor> tmp_outputs(outputs.size());
+
+  for (size_t i = 0; i < input_type.size(); ++i) {
+    bool cast_input =
+        (input_type[i] == -1 || input_type[i] != inputs[i].type());
+    if (!cast_input) {
+      tmp_inputs[i].ShareDataWith(inputs[i]);
+    } else {
+      tmp_inputs[i].Resize(inputs[i].dims());
+      tmp_inputs[i].mutable_data(dev_ctx.GetPlace(), input_type[i]);
+
+      const auto &cast_runner = NpuOpRunner(
+          "Cast", {inputs[i]}, {tmp_inputs[i]},
+          {{"dst_type", static_cast<int>(ConvertToNpuDtype(input_type[i]))}});
+      cast_runner.Run(dev_ctx.stream());
+    }
+  }
+  for (size_t i = 0; i < output_type.size(); ++i) {
+    bool cast_output =
+        (output_type[i] == -1 || output_type[i] != outputs[i].type());
+    if (!cast_output) {
+      tmp_outputs[i].ShareDataWith(outputs[i]);
+    } else {
+      tmp_outputs[i].Resize(outputs[i].dims());
+      tmp_outputs[i].mutable_data(dev_ctx.GetPlace(), output_type[i]);
+    }
+  }
+
+  op_runner(tmp_inputs, tmp_outputs, attrs, dev_ctx);
+
+  for (size_t i = 0; i < output_type.size(); ++i) {
+    bool cast_output =
+        (output_type[i] == -1 || output_type[i] != outputs[i].type());
+    if (cast_output) {
+      const auto &cast_runner = NpuOpRunner(
+          "Cast", {tmp_outputs[i]}, {outputs[i]},
+          {{"dst_type",
+            static_cast<int>(ConvertToNpuDtype(outputs[i].type()))}});
+      cast_runner.Run(dev_ctx.stream());
+    }
+  }
+}
+
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/fluid/operators/npu_op_runner.h b/paddle/fluid/operators/npu_op_runner.h
index 6db5f17d67118..a4a3786b5da53 100644
--- a/paddle/fluid/operators/npu_op_runner.h
+++ b/paddle/fluid/operators/npu_op_runner.h
@@ -103,6 +103,16 @@ class NpuOpRunner {
 
   void Run(aclrtStream stream = nullptr) const;
 
+  static void TypeAdapter(
+      const std::vector<Tensor> &inputs, const std::vector<Tensor> &outputs,
+      const NPUAttributeMap &attrs, const platform::NPUDeviceContext &dev_ctx,
+      std::function<void(const std::vector<Tensor> &,
+                         const std::vector<Tensor> &, const NPUAttributeMap &,
+                         const platform::NPUDeviceContext &)>
+          op_runner,
+      const std::vector<framework::proto::VarType::Type> &input_type,
+      const std::vector<framework::proto::VarType::Type> &output_type);
+
  private:
   aclTensorDesc *CreateTensorDesc(Tensor tensor,
                                   aclMemType mem_type = ACL_MEMTYPE_DEVICE);
diff --git a/paddle/fluid/operators/reduce_ops/reduce_max_op_npu.cc b/paddle/fluid/operators/reduce_ops/reduce_max_op_npu.cc
index 5efc7e9b869b7..68417cdad50c0 100644
--- a/paddle/fluid/operators/reduce_ops/reduce_max_op_npu.cc
+++ b/paddle/fluid/operators/reduce_ops/reduce_max_op_npu.cc
@@ -73,20 +73,33 @@ class ReduceMaxNPUKernel : public framework::OpKernel<T> {
       attr_input = {{"axes", dim_vec}, {"keep_dims", keep_dim}};
     }
 
-    auto stream =
-        ctx.template device_context<paddle::platform::NPUDeviceContext>()
-            .stream();
-
-    const auto& runner =
-        NpuOpRunner("ReduceMaxD", {*x}, {cast_out}, attr_input);
-    runner.Run(stream);
+    const auto& dev_ctx =
+        ctx.template device_context<paddle::platform::NPUDeviceContext>();
+    if (x->type() == framework::proto::VarType::INT64) {
+      auto op_func = [](const std::vector<Tensor>& inputs,
+                        const std::vector<Tensor>& outputs,
+                        const NPUAttributeMap& attrs,
+                        const platform::NPUDeviceContext& dev_ctx) {
+        const auto& runner =
+            NpuOpRunner("ReduceMaxD", {inputs[0]}, {outputs[0]}, attrs);
+        runner.Run(dev_ctx.stream());
+      };
+
+      NpuOpRunner::TypeAdapter({*x}, {cast_out}, attr_input, dev_ctx, op_func,
+                               {framework::proto::VarType::INT32},
+                               {framework::proto::VarType::INT32});
+    } else {
+      const auto& runner =
+          NpuOpRunner("ReduceMaxD", {*x}, {cast_out}, attr_input);
+      runner.Run(dev_ctx.stream());
+    }
 
     if (x->type() != cast_out_dtype) {
       auto dst_dtype = ConvertToNpuDtype(cast_out_dtype);
       const auto& runner_cast =
           NpuOpRunner("Cast", {cast_out}, {*out},
                       {{"dst_type", static_cast<int>(dst_dtype)}});
-      runner_cast.Run(stream);
+      runner_cast.Run(dev_ctx.stream());
     }
   }
 };
@@ -98,4 +111,6 @@ namespace ops = paddle::operators;
 namespace plat = paddle::platform;
 REGISTER_OP_NPU_KERNEL(
     reduce_max, ops::ReduceMaxNPUKernel<plat::NPUDeviceContext, float>,
-    ops::ReduceMaxNPUKernel<plat::NPUDeviceContext, plat::float16>);
+    ops::ReduceMaxNPUKernel<plat::NPUDeviceContext, plat::float16>,
+    ops::ReduceMaxNPUKernel<plat::NPUDeviceContext, int64_t>,
+    ops::ReduceMaxNPUKernel<plat::NPUDeviceContext, int>);
diff --git a/paddle/fluid/operators/reduce_ops/reduce_sum_op_npu.cc b/paddle/fluid/operators/reduce_ops/reduce_sum_op_npu.cc
index 78bd42ff00c83..33fcdbce9d0ee 100644
--- a/paddle/fluid/operators/reduce_ops/reduce_sum_op_npu.cc
+++ b/paddle/fluid/operators/reduce_ops/reduce_sum_op_npu.cc
@@ -142,12 +142,18 @@ namespace ops = paddle::operators;
 REGISTER_OP_NPU_KERNEL(
     reduce_sum,
     ops::ReduceSumNPUKernel<paddle::platform::NPUDeviceContext, float>,
+#ifdef PADDLE_WITH_ASCEND_INT64
+    ops::ReduceSumNPUKernel<paddle::platform::NPUDeviceContext, int64_t>,
+#endif
     ops::ReduceSumNPUKernel<paddle::platform::NPUDeviceContext, int>,
     ops::ReduceSumNPUKernel<paddle::platform::NPUDeviceContext,
                             paddle::platform::float16>);
 REGISTER_OP_NPU_KERNEL(
     reduce_sum_grad,
     ops::ReduceSumGradNPUKernel<paddle::platform::NPUDeviceContext, float>,
+#ifdef PADDLE_WITH_ASCEND_INT64
+    ops::ReduceSumGradNPUKernel<paddle::platform::NPUDeviceContext, int64_t>,
+#endif
     ops::ReduceSumGradNPUKernel<paddle::platform::NPUDeviceContext, int>,
     ops::ReduceSumGradNPUKernel<paddle::platform::NPUDeviceContext,
                                 paddle::platform::float16>);
diff --git a/paddle/fluid/operators/rnn_op.h b/paddle/fluid/operators/rnn_op.h
index ab64e50d450f0..5e19be5e4cfe1 100644
--- a/paddle/fluid/operators/rnn_op.h
+++ b/paddle/fluid/operators/rnn_op.h
@@ -965,6 +965,9 @@ class RNNCPUKernel : public framework::OpKernel<T> {
     }
     dropout_mask->mutable_data<uint8_t>(output->dims(), ctx.GetPlace());
 
+    auto& dev_ctx = ctx.template device_context<platform::CPUDeviceContext>();
+    math::SetConstant<platform::CPUDeviceContext, uint8_t> ones;
+    ones(dev_ctx, dropout_mask, static_cast<uint8_t>(1));
     // init the output and allocate the memory
     output->mutable_data<T>(ctx.GetPlace());
     int gate_num = 4;
diff --git a/paddle/fluid/operators/roll_op.cc b/paddle/fluid/operators/roll_op.cc
index b74dfc984affb..f82510556fde8 100644
--- a/paddle/fluid/operators/roll_op.cc
+++ b/paddle/fluid/operators/roll_op.cc
@@ -183,7 +183,12 @@ REGISTER_OP_VERSION(roll)
                      "(std::vector<int64_t>) Axis along which to roll. "
                      "It must have the same size with shifts, or size = 0.",
                      std::vector<int64_t>())
-            .DeleteAttr(
-                "dims",
-                "(std::vector<int64_t>) Dims along which to roll. "
-                "It must have the same size with shifts, or size = 0."));
+            .DeleteAttr("dims",
+                        "(std::vector<int64_t>) Dims along which to roll. "
+                        "It must have the same size with shifts, or size = 0."))
+    .AddCheckpoint(
+        R"ROC(Upgrade roll add a dispensable input "ShiftsTensor".)ROC",
+        paddle::framework::compatible::OpVersionDesc().NewInput(
+            "ShiftsTensor",
+            "The number of places by which the elements of"
+            "the tensor are shifted."));
diff --git a/paddle/fluid/operators/scale_op.cc b/paddle/fluid/operators/scale_op.cc
index a195452791048..038fcfcfee490 100644
--- a/paddle/fluid/operators/scale_op.cc
+++ b/paddle/fluid/operators/scale_op.cc
@@ -70,6 +70,17 @@ class ScaleOp : public framework::OperatorWithKernel {
 #endif
     return framework::OpKernelType(input_data_type, ctx.GetPlace());
   }
+
+  framework::KernelSignature GetExpectedPtenKernelArgs(
+      const framework::ExecutionContext &ctx) const override {
+    if (ctx.HasInput("ScaleTensor")) {
+      return framework::KernelSignature("scale.host", {"X", "ScaleTensor"},
+                                        {"bias", "bias_after_scale"}, {"Out"});
+    } else {
+      return framework::KernelSignature(
+          "scale", {"X"}, {"scale", "bias", "bias_after_scale"}, {"Out"});
+    }
+  }
 };
 
 class ScaleOpMaker : public framework::OpProtoAndCheckerMaker {
diff --git a/paddle/fluid/operators/scale_op.h b/paddle/fluid/operators/scale_op.h
index e7a07810c621c..a75c9fd4fd245 100644
--- a/paddle/fluid/operators/scale_op.h
+++ b/paddle/fluid/operators/scale_op.h
@@ -14,9 +14,13 @@ limitations under the License. */
 
 #pragma once
 
-#include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/eigen/eigen_function.h"
+#include "paddle/fluid/framework/pten_utils.h"
+
+// only can include the headers in paddle/top/api dirs
+#include "paddle/pten/api/lib/utils/tensor_utils.h"
+#include "paddle/pten/include/core.h"
+#include "paddle/pten/include/math.h"
 
 namespace paddle {
 namespace operators {
@@ -33,6 +37,7 @@ static inline T GetAttrFromTensor(const framework::Tensor* tensor) {
   return tensor_data[0];
 }
 
+// See Note [ Why still keep the original kernel implementation? ]
 template <typename DeviceContext, typename T>
 class ScaleKernel : public framework::OpKernel<T> {
  public:
@@ -40,13 +45,13 @@ class ScaleKernel : public framework::OpKernel<T> {
     auto* in_var = ctx.InputVar("X");
     auto* in = framework::GetLoDTensorOrSelectedRowsValueFromVar(*in_var);
 
-    auto bias = static_cast<T>(ctx.Attr<float>("bias"));
+    auto bias = ctx.Attr<float>("bias");
     auto bias_after_scale = ctx.Attr<bool>("bias_after_scale");
 
-    auto scale = static_cast<T>(ctx.Attr<float>("scale"));
+    auto scale = ctx.Attr<float>("scale");
     if (ctx.HasInput("ScaleTensor")) {
       auto* scale_tensor = ctx.Input<framework::Tensor>("ScaleTensor");
-      scale = GetAttrFromTensor<T>(scale_tensor);
+      scale = static_cast<float>(GetAttrFromTensor<T>(scale_tensor));
     }
 
     auto* out_var = ctx.OutputVar("Out");
@@ -56,22 +61,17 @@ class ScaleKernel : public framework::OpKernel<T> {
       out_slr->set_rows(in_slr.rows());
       out_slr->set_height(in_slr.height());
     }
-
     auto* out =
         framework::GetMutableLoDTensorOrSelectedRowsValueFromVar(out_var);
     out->mutable_data<T>(in->place());
+    auto& dev_ctx = ctx.device_context<DeviceContext>();
 
-    PADDLE_ENFORCE_EQ(in->dims(), out->dims(),
-                      paddle::platform::errors::InvalidArgument(
-                          "the input and output should have the same dim"
-                          "but input dim is %s, output dim is %s",
-                          in->dims(), out->dims()));
+    auto pt_x = paddle::experimental::MakePtenDenseTensor(*in);
+    auto pt_out = paddle::experimental::MakePtenDenseTensor(*out);
 
-    auto eigen_out = framework::EigenVector<T>::Flatten(*out);
-    auto eigen_in = framework::EigenVector<T>::Flatten(*in);
-    auto& dev = *ctx.template device_context<DeviceContext>().eigen_device();
-    EigenScale<std::decay_t<decltype(dev)>, T>::Eval(
-        dev, eigen_out, eigen_in, scale, bias, bias_after_scale);
+    // call new kernel
+    pten::Scale<T>(dev_ctx, *pt_x.get(), scale, bias, bias_after_scale,
+                   pt_out.get());
   }
 };
 
diff --git a/paddle/fluid/operators/scale_op_npu.cc b/paddle/fluid/operators/scale_op_npu.cc
index 744a9b137f622..c2f320ed684b8 100644
--- a/paddle/fluid/operators/scale_op_npu.cc
+++ b/paddle/fluid/operators/scale_op_npu.cc
@@ -37,15 +37,47 @@ class ScaleNPUKernel : public framework::OpKernel<T> {
       auto* scale_tensor = ctx.Input<framework::Tensor>("ScaleTensor");
       scale = static_cast<float>(GetAttrFromTensor<T>(scale_tensor));
     }
-
+    if (isinf(scale)) {
+      if (signbit(scale)) {
+        scale = -std::numeric_limits<float>::max();
+      } else {
+        scale = std::numeric_limits<float>::max();
+      }
+    }
     if (!bias_after_scale) {
       bias *= scale;
     }
     out->mutable_data<T>(ctx.GetPlace());
-    const auto& runner =
-        NpuOpRunner("Power", {*x}, {*out},
-                    {{"power", power}, {"scale", scale}, {"shift", bias}});
-    runner.Run(stream);
+
+    framework::NPUAttributeMap attrs = {
+        {"power", power}, {"scale", scale}, {"shift", bias}};
+    const auto& dev_ctx =
+        ctx.template device_context<paddle::platform::NPUDeviceContext>();
+    auto op_func = [](const std::vector<Tensor>& inputs,
+                      const std::vector<Tensor>& outputs,
+                      const NPUAttributeMap& attrs,
+                      const platform::NPUDeviceContext& dev_ctx) {
+      const auto& muls_runner = NpuOpRunner("Muls", {inputs[0]}, {outputs[0]},
+                                            {{"value", attrs.at("scale")}});
+      muls_runner.Run(dev_ctx.stream());
+
+      const auto& adds_runner = NpuOpRunner("Adds", {outputs[0]}, {outputs[0]},
+                                            {{"value", attrs.at("shift")}});
+      adds_runner.Run(dev_ctx.stream());
+    };
+
+    if (x->type() == framework::proto::VarType::INT32) {
+      NpuOpRunner::TypeAdapter({*x}, {*out}, attrs, dev_ctx, op_func,
+                               {framework::proto::VarType::INT32},
+                               {framework::proto::VarType::INT32});
+    } else if (x->type() == framework::proto::VarType::INT64) {
+      NpuOpRunner::TypeAdapter({*x}, {*out}, attrs, dev_ctx, op_func,
+                               {framework::proto::VarType::INT32},
+                               {framework::proto::VarType::INT32});
+    } else {
+      const auto& runner = NpuOpRunner("Power", {*x}, {*out}, attrs);
+      runner.Run(stream);
+    }
   }
 };
 
@@ -54,4 +86,6 @@ class ScaleNPUKernel : public framework::OpKernel<T> {
 
 REGISTER_OP_NPU_KERNEL(
     scale, paddle::operators::ScaleNPUKernel<float>,
-    paddle::operators::ScaleNPUKernel<paddle::platform::float16>);
+    paddle::operators::ScaleNPUKernel<paddle::platform::float16>,
+    paddle::operators::ScaleNPUKernel<int64_t>,
+    paddle::operators::ScaleNPUKernel<int>);
diff --git a/paddle/fluid/operators/scale_op_xpu.cc b/paddle/fluid/operators/scale_op_xpu.cc
index e0dfad91570ad..d3943e09b6d0b 100644
--- a/paddle/fluid/operators/scale_op_xpu.cc
+++ b/paddle/fluid/operators/scale_op_xpu.cc
@@ -22,12 +22,14 @@ namespace paddle {
 namespace operators {
 template <typename DeviceContext, typename T>
 class ScaleXPUKernel : public framework::OpKernel<T> {
+  using XPUType = typename XPUTypeTrait<T>::Type;
+
  public:
   virtual void Compute(const framework::ExecutionContext& ctx) const {
     auto* in_var = ctx.InputVar("X");
     auto* in = framework::GetLoDTensorOrSelectedRowsValueFromVar(*in_var);
-    auto scale = static_cast<T>(ctx.Attr<float>("scale"));
-    auto bias = static_cast<T>(ctx.Attr<float>("bias"));
+    auto scale = static_cast<float>(ctx.Attr<float>("scale"));
+    auto bias = static_cast<float>(ctx.Attr<float>("bias"));
     auto bias_after_scale = ctx.Attr<bool>("bias_after_scale");
     auto* out_var = ctx.OutputVar("Out");
     if (in_var->IsType<framework::SelectedRows>() && in_var != out_var) {
@@ -46,9 +48,10 @@ class ScaleXPUKernel : public framework::OpKernel<T> {
                                           in->dims().to_str().c_str(),
                                           out->dims().to_str().c_str()));
     auto& dev_ctx = ctx.template device_context<DeviceContext>();
-    int r =
-        xpu::scale(dev_ctx.x_context(), in->data<float>(), out->data<float>(),
-                   in->numel(), bias_after_scale, scale, bias);
+    int r = xpu::scale(dev_ctx.x_context(),
+                       reinterpret_cast<const XPUType*>(in->data<T>()),
+                       reinterpret_cast<XPUType*>(out->data<T>()), in->numel(),
+                       bias_after_scale, scale, bias);
     PADDLE_ENFORCE_EQ(
         r, XPU_SUCCESS,
         platform::errors::External("XPU scale kernel return wrong value[%d %s]",
@@ -60,7 +63,11 @@ class ScaleXPUKernel : public framework::OpKernel<T> {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
+
 REGISTER_OP_XPU_KERNEL(
-    scale, ops::ScaleXPUKernel<paddle::platform::XPUDeviceContext, float>);
+    scale, ops::ScaleXPUKernel<paddle::platform::XPUDeviceContext, float>,
+    ops::ScaleXPUKernel<paddle::platform::XPUDeviceContext,
+                        paddle::platform::float16>,
+    ops::ScaleXPUKernel<paddle::platform::XPUDeviceContext, int64_t>);
 
 #endif
diff --git a/paddle/fluid/operators/sequence_ops/sequence_pad_op.h b/paddle/fluid/operators/sequence_ops/sequence_pad_op.h
index d8ae0b200df7d..a9660f05c3c6b 100644
--- a/paddle/fluid/operators/sequence_ops/sequence_pad_op.h
+++ b/paddle/fluid/operators/sequence_ops/sequence_pad_op.h
@@ -46,7 +46,7 @@ class SequencePadOpKernel : public framework::OpKernel<T> {
 
     math::PaddingLoDTensorFunctor<DeviceContext, T>()(
         ctx.template device_context<DeviceContext>(), *x, out, *pad_value,
-        padded_length, 0, false, false, false, math::kBatchLengthWidth);
+        padded_length, 0, false, math::kBatchLengthWidth);
 
     LoDTensor seq_len;
     seq_len.Resize(len_t->dims());
@@ -72,7 +72,7 @@ class SequencePadGradOpKernel : public framework::OpKernel<T> {
 
       math::UnpaddingLoDTensorFunctor<DeviceContext, T>()(
           ctx.template device_context<DeviceContext>(), *d_out, d_x,
-          padded_length, 0, false, false, false, math::kBatchLengthWidth);
+          padded_length, 0, false, math::kBatchLengthWidth);
     }
   }
 };
diff --git a/paddle/fluid/operators/sequence_ops/sequence_unpad_op.h b/paddle/fluid/operators/sequence_ops/sequence_unpad_op.h
index 398c3bba07569..60ba4797db1e2 100644
--- a/paddle/fluid/operators/sequence_ops/sequence_unpad_op.h
+++ b/paddle/fluid/operators/sequence_ops/sequence_unpad_op.h
@@ -69,8 +69,7 @@ class SequenceUnpadOpKernel : public framework::OpKernel<T> {
 
     int64_t padded_length = x_t->dims()[1];
     math::UnpaddingLoDTensorFunctor<DeviceContext, T>()(
-        dev_ctx, *x_t, out_t, padded_length, 0, false, false, false,
-        math::kBatchLengthWidth);
+        dev_ctx, *x_t, out_t, padded_length, 0, false, math::kBatchLengthWidth);
   }
 };
 
@@ -94,7 +93,7 @@ class SequenceUnpadGradOpKernel : public framework::OpKernel<T> {
 
       math::PaddingLoDTensorFunctor<DeviceContext, T>()(
           ctx.template device_context<DeviceContext>(), *d_out, d_x, zero_pads,
-          padded_length, 0, false, false, false, math::kBatchLengthWidth);
+          padded_length, 0, false, math::kBatchLengthWidth);
     }
   }
 };
diff --git a/paddle/fluid/operators/sign_op.h b/paddle/fluid/operators/sign_op.h
index b6d501afa621a..b7a46cc546797 100644
--- a/paddle/fluid/operators/sign_op.h
+++ b/paddle/fluid/operators/sign_op.h
@@ -16,24 +16,31 @@ limitations under the License. */
 
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/pten_utils.h"
 #include "paddle/fluid/operators/eigen/eigen_function.h"
 
+// only can include the headers in paddle/pten/api dirs
+#include "paddle/pten/include/core.h"
+#include "paddle/pten/include/math.h"
+
 namespace paddle {
 namespace operators {
+
+// See Note [ Why still keep the original kernel implementation? ]
 template <typename DeviceContext, typename T>
 class SignKernel : public framework::OpKernel<T> {
  public:
   virtual void Compute(const framework::ExecutionContext& context) const {
+    auto* x = context.Input<framework::Tensor>("X");
     auto* out = context.Output<framework::Tensor>("Out");
-    auto* in = context.Input<framework::Tensor>("X");
-    out->mutable_data<T>(in->place());
-
-    auto eigen_out = framework::EigenVector<T>::Flatten(*out);
-    auto eigen_in = framework::EigenVector<T>::Flatten(*in);
-    auto& place =
-        *context.template device_context<DeviceContext>().eigen_device();
-    EigenSign<std::decay_t<decltype(place)>, T>::Eval(place, eigen_out,
-                                                      eigen_in);
+    auto& dev_ctx = context.device_context<DeviceContext>();
+    out->mutable_data<T>(x->place());
+
+    auto pt_x = paddle::experimental::MakePtenDenseTensor(*x);
+    auto pt_out = paddle::experimental::MakePtenDenseTensor(*out);
+
+    // call new kernel
+    pten::Sign<T>(dev_ctx, *pt_x.get(), pt_out.get());
   }
 };
 
diff --git a/paddle/fluid/operators/softmax_cudnn_op.cu.h b/paddle/fluid/operators/softmax_cudnn_op.cu.h
index cb63e88d63623..68b694a59f47d 100644
--- a/paddle/fluid/operators/softmax_cudnn_op.cu.h
+++ b/paddle/fluid/operators/softmax_cudnn_op.cu.h
@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once
 
 #include "paddle/fluid/operators/amp/fp16_type_traits.h"
+#include "paddle/fluid/operators/kernel_primitives/kernel_primitives.h"
 #include "paddle/fluid/operators/math/math_cuda_utils.h"
 #include "paddle/fluid/operators/softmax_op.h"
 #include "paddle/fluid/platform/cuda_device_function.h"
@@ -99,6 +100,97 @@ __device__ __forceinline__ void WarpReduceMax(T* sum) {
   }
 }
 
+namespace kps = paddle::operators::kernel_primitives;
+
+template <typename Tx, typename Ty = Tx>
+struct ReduceMaxFunctor {
+  inline Ty initial() { return -std::numeric_limits<Ty>::infinity(); }
+
+  __device__ __forceinline__ Ty operator()(const Ty& a, const Ty& b) const {
+    return max(a, b);
+  }
+};
+
+template <typename Tx, typename Ty = Tx>
+struct ExpSubFunctor {
+  HOSTDEVICE inline ExpSubFunctor() { y = static_cast<Tx>(0.0f); }
+
+  HOSTDEVICE explicit inline ExpSubFunctor(Tx y) : y((Tx)(y)) {}
+
+  HOSTDEVICE inline Ty operator()(const Tx& x) const {
+    return static_cast<Ty>(std::exp(x - y));
+  }
+
+ private:
+  Tx y;
+};
+
+template <typename Tx, typename Ty = Tx>
+struct ExpMulFunctor {
+  HOSTDEVICE inline ExpMulFunctor() { y = static_cast<Tx>(1.0f); }
+
+  HOSTDEVICE explicit inline ExpMulFunctor(Tx y) : y((Tx)(y)) {}
+
+  HOSTDEVICE inline Ty operator()(const Tx& x) const {
+    return static_cast<Ty>(std::exp(x) * y);
+  }
+
+ private:
+  Tx y;
+};
+
+template <typename Tx, typename Ty = Tx>
+struct UnarySubFunctor {
+  HOSTDEVICE inline UnarySubFunctor() { y = static_cast<Tx>(0.0f); }
+
+  HOSTDEVICE explicit inline UnarySubFunctor(Tx y) : y((Tx)(y)) {}
+
+  HOSTDEVICE inline Ty operator()(const Tx& x) const {
+    return static_cast<Ty>(x - y);
+  }
+
+ private:
+  Tx y;
+};
+
+template <typename Tx, typename Ty = Tx>
+struct UnaryLogFunctor {
+  HOSTDEVICE inline UnaryLogFunctor() {}
+
+  HOSTDEVICE explicit inline UnaryLogFunctor(int n) {}
+
+  HOSTDEVICE inline Ty operator()(const Tx& x) const {
+    return static_cast<Ty>(std::log(x));
+  }
+};
+
+template <typename Tx, typename Ty>
+struct DataTransFunctor {
+  HOSTDEVICE inline DataTransFunctor() {}
+
+  HOSTDEVICE explicit inline DataTransFunctor(int n) {}
+
+  HOSTDEVICE inline Ty operator()(const Tx& x) const {
+    return x == -std::numeric_limits<Tx>::infinity()
+               ? -std::numeric_limits<Ty>::infinity()
+               : static_cast<Ty>(x);
+  }
+};
+
+template <typename Tx, typename Ty = Tx>
+struct UnaryDivFunctor {
+  HOSTDEVICE inline UnaryDivFunctor() { n_inv = static_cast<Tx>(1.0f); }
+
+  HOSTDEVICE explicit inline UnaryDivFunctor(Tx n) : n_inv((Tx)(1.0 / n)) {}
+
+  HOSTDEVICE inline Ty operator()(const Tx& x) const {
+    return static_cast<Ty>(x * n_inv);
+  }
+
+ private:
+  Tx n_inv;
+};
+
 /*
 Core function of computing softmax forward for axis=-1.
 The computation includes
@@ -117,12 +209,14 @@ __global__ void WarpSoftmaxForward(T* softmax, const T* src,
   constexpr int kDimCeil = 1 << Log2Elements;
   constexpr int kWarpSize = (kDimCeil < 32) ? kDimCeil : 32;
   constexpr int kVSize = sizeof(VecT) / sizeof(T);
-  constexpr int kIterations = kDimCeil / kWarpSize;
-  constexpr int kIterationsV =
-      (kIterations >= kVSize) ? (kIterations / kVSize) : 1;
+  constexpr int kLoops = kDimCeil / kWarpSize;
+  constexpr int kLoopsV = (kLoops >= kVSize) ? (kLoops / kVSize) : 1;
   constexpr int kBatchSize = (kDimCeil <= 32) ? 2 : 1;
-
   int first_batch = (blockDim.y * blockIdx.x + threadIdx.y) * kBatchSize;
+  constexpr int kStep = kBatchSize * kLoopsV * kVSize;
+  constexpr int kVItem = kLoopsV * kVSize;
+  constexpr AccT kLowInf = -std::numeric_limits<AccT>::infinity();
+  using kMode = kps::details::ReduceMode;
 
   // max index to read
   int idx_max_v[kBatchSize];
@@ -133,146 +227,51 @@ __global__ void WarpSoftmaxForward(T* softmax, const T* src,
   }
 
   // read data from global memory
-  AccT srcdata[kBatchSize][kIterationsV][kVSize];
-
+  AccT srcdata[kBatchSize][kLoopsV][kVSize];
+  kps::Init<AccT, kStep>(&srcdata[0][0][0], kLowInf);
+  T src_tmp[kBatchSize][kLoopsV][kVSize];
+  kps::Init<T, kStep>(&src_tmp[0][0][0], -std::numeric_limits<T>::infinity());
 #pragma unroll
   for (int i = 0; i < kBatchSize; ++i) {
-// read data
-#pragma unroll
-    for (int it = 0; it < kIterationsV; ++it) {
-      int src_idx = threadIdx.x + it * kWarpSize;
-      if (kVSize == 1) {
-        if (src_idx < idx_max_v[i]) {
-          srcdata[i][it][0] =
-              static_cast<AccT>(src[(first_batch + i) * stride + src_idx]);
-        } else {
-          srcdata[i][it][0] = -std::numeric_limits<AccT>::infinity();
-        }
-      } else {
-        const VecT* src_v =
-            reinterpret_cast<const VecT*>(&src[(first_batch + i) * stride]);
-        if (src_idx < idx_max_v[i]) {
-          VecT srctmp = src_v[src_idx];
-          const T* srcinptr = reinterpret_cast<const T*>(&srctmp);
-#pragma unroll
-          for (int s = 0; s < kVSize; s++) {
-            srcdata[i][it][s] = static_cast<AccT>(srcinptr[s]);
-          }
-        } else {
-#pragma unroll
-          for (int s = 0; s < kVSize; s++) {
-            srcdata[i][it][s] = -std::numeric_limits<AccT>::infinity();
-          }
-        }
-      }
-    }
+    int ptr = (first_batch + i) * stride;
+    const VecT* src_v = reinterpret_cast<const VecT*>(&src[ptr]);
+    VecT* reg_v = reinterpret_cast<VecT*>(&src_tmp[i][0][0]);
+    kps::ReadData<VecT, VecT, kLoopsV, 1, 1, true>(
+        &reg_v[0], &src_v[0], idx_max_v[i], 0, kWarpSize, 1);
+    kps::ElementwiseUnary<T, AccT, kVItem, 1, 1, DataTransFunctor<T, AccT>>(
+        &srcdata[i][0][0], &src_tmp[i][0][0], DataTransFunctor<T, AccT>());
   }
 
-  // compute max value
-  AccT max_value[kBatchSize];
-#pragma unroll
-  for (int i = 0; i < kBatchSize; ++i) {
-    // it = 0
-    AccT valmax = srcdata[i][0][0];
-#pragma unroll
-    for (int s = 1; s < kVSize; ++s) {
-      valmax = (valmax > srcdata[i][0][s]) ? valmax : srcdata[i][0][s];
-    }
-    max_value[i] = valmax;
-
-// it = 1, 2, ...
-#pragma unroll
-    for (int it = 1; it < kIterationsV; ++it) {
-      AccT valmax = srcdata[i][it][0];
-#pragma unroll
-      for (int s = 1; s < kVSize; ++s) {
-        valmax = (valmax > srcdata[i][it][s]) ? valmax : srcdata[i][it][s];
-      }
-      max_value[i] = (max_value[i] > valmax) ? max_value[i] : valmax;
-    }
-  }
-  WarpReduceMax<AccT, kBatchSize, kWarpSize>(max_value);
+  // compute max
+  AccT max[kBatchSize];
+  kps::Init<AccT, kBatchSize>(&max[0], kLowInf);
+  kps::Reduce<AccT, kVItem, kBatchSize, 1, ReduceMaxFunctor<AccT>,
+              kMode::kLocalMode>(&max[0], &srcdata[0][0][0],
+                                 ReduceMaxFunctor<AccT>(), true);
+  WarpReduceMax<AccT, kBatchSize, kWarpSize>(max);
 
   // compute sum
-  AccT sum[kBatchSize];
-#pragma unroll
+  AccT sum[kBatchSize] = {0};
   for (int i = 0; i < kBatchSize; ++i) {
-    // it = 0
-    if (LogMode) {
-      sum[i] = std::exp(srcdata[i][0][0] - max_value[i]);
-    } else {
-      srcdata[i][0][0] = std::exp(srcdata[i][0][0] - max_value[i]);
-      sum[i] = srcdata[i][0][0];
-    }
-#pragma unroll
-    for (int s = 1; s < kVSize; ++s) {
-      if (LogMode) {
-        sum[i] += std::exp(srcdata[i][0][s] - max_value[i]);
-      } else {
-        srcdata[i][0][s] = std::exp(srcdata[i][0][s] - max_value[i]);
-        sum[i] += srcdata[i][0][s];
-      }
-    }
-
-// it = 1, 2, ...
-#pragma unroll
-    for (int it = 1; it < kIterationsV; ++it) {
-#pragma unroll
-      for (int s = 0; s < kVSize; ++s) {
-        if (LogMode) {
-          sum[i] += std::exp(srcdata[i][it][s] - max_value[i]);
-        } else {
-          srcdata[i][it][s] = std::exp(srcdata[i][it][s] - max_value[i]);
-          sum[i] += srcdata[i][it][s];
-        }
-      }
-    }
+    kps::ElementwiseUnary<AccT, AccT, kVItem, 1, 1, ExpSubFunctor<AccT>>(
+        &srcdata[i][0][0], &srcdata[i][0][0], ExpSubFunctor<AccT>(max[i]));
   }
+  kps::Reduce<AccT, kVItem, kBatchSize, 1, kps::AddFunctor<AccT>,
+              kMode::kLocalMode>(&sum[0], &srcdata[0][0][0],
+                                 kps::AddFunctor<AccT>(), true);
   WarpReduceSum<AccT, kBatchSize, kWarpSize>(sum);
 
-// write result to global memory
+  // write result to global memory
+  T out_tmp[kBatchSize][kLoopsV][kVSize];
 #pragma unroll
   for (int i = 0; i < kBatchSize; ++i) {
-    if (LogMode) {
-      sum[i] = std::log(sum[i]);
-    }
-
-#pragma unroll
-    for (int it = 0; it < kIterationsV; ++it) {
-      int idx = threadIdx.x + it * kWarpSize;
-      if (kVSize == 1) {
-        if (idx < idx_max_v[i]) {
-          if (LogMode) {
-            softmax[(first_batch + i) * stride + idx] =
-                srcdata[i][it][0] - max_value[i] - sum[i];
-          } else {
-            softmax[(first_batch + i) * stride + idx] =
-                srcdata[i][it][0] / sum[i];
-          }
-        } else {
-          break;
-        }
-      } else {
-        VecT* softmax_v =
-            reinterpret_cast<VecT*>(&softmax[(first_batch + i) * stride]);
-        VecT tmpdata;
-        T* tmpptr = reinterpret_cast<T*>(&tmpdata);
-#pragma unroll
-        for (int s = 0; s < kVSize; ++s) {
-          if (LogMode) {
-            tmpptr[s] = srcdata[i][it][s] - max_value[i] - sum[i];
-          } else {
-            tmpptr[s] = srcdata[i][it][s] / sum[i];
-          }
-        }
-
-        if (idx < idx_max_v[i]) {
-          softmax_v[idx] = tmpdata;
-        } else {
-          break;
-        }
-      }
-    }
+    kps::ElementwiseUnary<AccT, T, kVItem, 1, 1, UnaryDivFunctor<AccT>>(
+        &out_tmp[i][0][0], &srcdata[i][0][0], UnaryDivFunctor<AccT>(sum[i]));
+    int softmax_ptr = (first_batch + i) * stride;
+    VecT* softmax_v = reinterpret_cast<VecT*>(&softmax[softmax_ptr]);
+    VecT* reg_v = reinterpret_cast<VecT*>(&out_tmp[i][0][0]);
+    kps::WriteData<VecT, VecT, kLoopsV, 1, 1, true>(
+        &softmax_v[0], &reg_v[0], idx_max_v[i], 0, kWarpSize, 1);
   }
 }
 
@@ -293,101 +292,82 @@ __global__ void WarpSoftmaxBackward(T* dst, const T* grad, const T* src,
   constexpr int kVSize = sizeof(VecT) / sizeof(T);
   constexpr int kDimCeil = 1 << Log2Elements;
   constexpr int kWarpSize = (kDimCeil < 32) ? kDimCeil : 32;
-  constexpr int kIterations = kDimCeil / kWarpSize;
+  constexpr int kLoops = kDimCeil / kWarpSize;
   constexpr int kBatchSize = (kDimCeil <= 128) ? 2 : 1;
-  constexpr int kIterationsV =
-      (kIterations >= kVSize) ? (kIterations / kVSize) : 1;
+  constexpr int kLoopsV = (kLoops >= kVSize) ? (kLoops / kVSize) : 1;
   int element_count_v = element_count / kVSize;
-
   int first_batch = (blockDim.y * blockIdx.x + threadIdx.y) * kBatchSize;
-  int local_batches = batch_size - first_batch;
-  if (local_batches > kBatchSize) {
-    local_batches = kBatchSize;
+  int local_batches = min(batch_size - first_batch, kBatchSize);
+
+  // max index to read
+  int idx_max_v[kBatchSize];
+#pragma unroll
+  for (int i = 0; i < kBatchSize; i++) {
+    int idx_max = ((i + first_batch) < batch_size) ? element_count : 0;
+    idx_max_v[i] = idx_max / kVSize;
   }
 
   // read data from global memory
-  VecT src_reg[kBatchSize][kIterationsV];
-  VecT grad_reg[kBatchSize][kIterationsV];
-
-  for (int i = 0; i < kBatchSize; ++i) {
-    const VecT* src_v =
-        reinterpret_cast<const VecT*>(&src[(first_batch + i) * stride]);
-    const VecT* grad_v =
-        reinterpret_cast<const VecT*>(&grad[(first_batch + i) * stride]);
-
-    // max index to read
-    int idx_max = (i < local_batches) ? element_count : 0;
-    int idx_max_v = idx_max / kVSize;
-
-    // read data
-    for (int it = 0; it < kIterationsV; ++it) {
-      int src_idx = threadIdx.x + it * kWarpSize;
-      if (src_idx < idx_max_v) {
-        src_reg[i][it] = src_v[src_idx];
-        grad_reg[i][it] = grad_v[src_idx];
-      } else {
+  VecT src_reg[kBatchSize][kLoopsV];
+  VecT grad_reg[kBatchSize][kLoopsV];
+  VecT k_value;
+  for (int s = 0; s < kVSize; s++) {
+    reinterpret_cast<T*>(&k_value)[s] = 0.0;
+  }
+  kps::Init<VecT, kBatchSize * kLoopsV>(&src_reg[0][0], k_value);
+  kps::Init<VecT, kBatchSize * kLoopsV>(&grad_reg[0][0], k_value);
 #pragma unroll
-        for (int s = 0; s < kVSize; s++) {
-          reinterpret_cast<T*>(&src_reg[i][it])[s] = 0.0;
-          reinterpret_cast<T*>(&grad_reg[i][it])[s] = 0.0;
-        }
-      }
-    }
+  for (int i = 0; i < kBatchSize; ++i) {
+    int flag = i < local_batches ? 1 : 0;
+    int ptr = (first_batch + i) * stride;
+    const VecT* src_v = reinterpret_cast<const VecT*>(&src[ptr]);
+    const VecT* grad_v = reinterpret_cast<const VecT*>(&grad[ptr]);
+    kps::ReadData<VecT, VecT, kLoopsV, 1, 1, true>(
+        &src_reg[i][0], &src_v[0], idx_max_v[i], 0, kWarpSize, flag);
+    kps::ReadData<VecT, VecT, kLoopsV, 1, 1, true>(
+        &grad_reg[i][0], &grad_v[0], idx_max_v[i], 0, kWarpSize, flag);
   }
 
+  // change T to AccT
+  AccT src_tmp[kBatchSize][kLoopsV][kVSize];
+  AccT grad_tmp[kBatchSize][kLoopsV][kVSize];
+  const T* src_ptr = reinterpret_cast<const T*>(&src_reg[0][0]);
+  const T* grad_ptr = reinterpret_cast<const T*>(&grad_reg[0][0]);
+  constexpr int kStep = kBatchSize * kLoopsV * kVSize;
+  constexpr int kVItem = kLoopsV * kVSize;
+  kps::ElementwiseUnary<T, AccT, kStep, 1, 1, DataTransFunctor<T, AccT>>(
+      &src_tmp[0][0][0], &src_ptr[0], DataTransFunctor<T, AccT>());
+  kps::ElementwiseUnary<T, AccT, kStep, 1, 1, DataTransFunctor<T, AccT>>(
+      &grad_tmp[0][0][0], &grad_ptr[0], DataTransFunctor<T, AccT>());
+
   // compute sum
   AccT sum[kBatchSize]{0.0};
-#pragma unroll
-  for (int i = 0; i < kBatchSize; ++i) {
-#pragma unroll
-    for (int it = 0; it < kIterationsV; ++it) {
-      T* gradptr = reinterpret_cast<T*>(&grad_reg[i][it]);
-      T* srcptr = reinterpret_cast<T*>(&src_reg[i][it]);
-#pragma unroll
-      for (int s = 0; s < kVSize; ++s) {
-        if (LogMode) {
-          sum[i] += static_cast<AccT>(gradptr[s]);
-        } else {
-          sum[i] += static_cast<AccT>(gradptr[s] * srcptr[s]);
-        }
-      }
-    }
-  }
+  AccT sum_tmp[kBatchSize][kLoopsV][kVSize];
+  AccT* gradptr = reinterpret_cast<AccT*>(&grad_tmp[0][0][0]);
+  AccT* srcptr = reinterpret_cast<AccT*>(&src_tmp[0][0][0]);
+  kps::ElementwiseBinary<AccT, AccT, kStep, 1, 1, kps::MulFunctor<AccT>>(
+      &sum_tmp[0][0][0], &gradptr[0], &srcptr[0], kps::MulFunctor<AccT>());
+  kps::Reduce<AccT, kVItem, kBatchSize, 1, kps::AddFunctor<AccT>,
+              kps::details::ReduceMode::kLocalMode>(
+      &sum[0], &sum_tmp[0][0][0], kps::AddFunctor<AccT>(), true);
   WarpReduceSum<AccT, kBatchSize, kWarpSize>(sum);
 
-// write result
+  // write result to global memory
+  AccT out[kBatchSize][kLoopsV][kVSize];
+  T out_tmp[kBatchSize][kLoopsV][kVSize];
 #pragma unroll
   for (int i = 0; i < kBatchSize; ++i) {
     if (i >= local_batches) break;
-
+    AccT* gradptr = reinterpret_cast<AccT*>(&grad_tmp[i][0][0]);
+    AccT* srcptr = reinterpret_cast<AccT*>(&src_tmp[i][0][0]);
+    kps::ElementwiseUnary<AccT, AccT, kVItem, 1, 1, UnarySubFunctor<AccT>>(
+        &out[i][0][0], &gradptr[0], UnarySubFunctor<AccT>(sum[i]));
+    kps::ElementwiseBinary<AccT, T, kVItem, 1, 1, kps::MulFunctor<AccT>>(
+        &out_tmp[i][0][0], &srcptr[0], &out[i][0][0], kps::MulFunctor<AccT>());
     VecT* dst_v = reinterpret_cast<VecT*>(&dst[(first_batch + i) * stride]);
-
-    // max index to write
-    int idx_max = (i < local_batches) ? element_count : 0;
-    int idx_max_v = idx_max / kVSize;
-
-#pragma unroll
-    for (int it = 0; it < kIterationsV; ++it) {
-      VecT tmpdata;
-      T* tmpptr = reinterpret_cast<T*>(&tmpdata);
-      T* gradptr = reinterpret_cast<T*>(&grad_reg[i][it]);
-      T* srcptr = reinterpret_cast<T*>(&src_reg[i][it]);
-#pragma unroll
-      for (int s = 0; s < kVSize; ++s) {
-        if (LogMode) {
-          tmpptr[s] = static_cast<AccT>(gradptr[s]) -
-                      std::exp(static_cast<AccT>(srcptr[s])) * sum[i];
-        } else {
-          tmpptr[s] = static_cast<AccT>(srcptr[s]) *
-                      (static_cast<AccT>(gradptr[s]) - sum[i]);
-        }
-      }
-
-      int idx = threadIdx.x + it * kWarpSize;
-      if (idx < idx_max_v) {
-        dst_v[idx] = tmpdata;
-      }
-    }
+    VecT* reg_v = reinterpret_cast<VecT*>(&out_tmp[i][0][0]);
+    kps::WriteData<VecT, VecT, kLoopsV, 1, 1, true>(
+        &dst_v[0], &reg_v[0], idx_max_v[i], 0, kWarpSize, 1);
   }
 }
 
@@ -493,6 +473,7 @@ void SoftmaxForwardCUDAKernelDriver(const platform::CUDADeviceContext& dev_ctx,
     // vectorization read/write
     using T4 = typename VecT4<T>::Type;
     using T2 = typename VecT2<T>::Type;
+
     if (dim % 4 == 0) {
       SwitchWarpSoftmaxForward<T, T4, LogMode>(blocks, threads, dev_ctx,
                                                out_data, x.data<T>(), N, dim,
diff --git a/paddle/fluid/operators/softmax_op.cc b/paddle/fluid/operators/softmax_op.cc
index 4b0179953030a..3b1753b49b11d 100644
--- a/paddle/fluid/operators/softmax_op.cc
+++ b/paddle/fluid/operators/softmax_op.cc
@@ -85,9 +85,10 @@ class SoftmaxOp : public framework::OperatorWithKernel {
 
 #ifndef PADDLE_WITH_ASCEND_CL
     if (input_data_type == framework::proto::VarType::FP16) {
-      PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()), true,
-                        platform::errors::InvalidArgument(
-                            "float16 can only be used on GPU place"));
+      PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()) ||
+                            platform::is_xpu_place(ctx.GetPlace()),
+                        true, platform::errors::InvalidArgument(
+                                  "float16 can only be used on GPU/XPU place"));
     }
 #endif
 
@@ -214,9 +215,10 @@ class SoftmaxOpGrad : public framework::OperatorWithKernel {
 #endif
     if (input_data_type == framework::proto::VarType::FP16) {
       if (!(platform::is_gpu_place(ctx.GetPlace()) ||
-            platform::is_npu_place(ctx.GetPlace())))
+            platform::is_npu_place(ctx.GetPlace()) ||
+            platform::is_xpu_place(ctx.GetPlace())))
         PADDLE_THROW(platform::errors::InvalidArgument(
-            "float16 can only be used on GPU/NPU place"));
+            "float16 can only be used on GPU/NPU/XPU place"));
     }
 
     return framework::OpKernelType(input_data_type, ctx.GetPlace(), layout_,
diff --git a/paddle/fluid/operators/softmax_op_xpu.cc b/paddle/fluid/operators/softmax_op_xpu.cc
index 3527478f76610..0adc12e684c3a 100644
--- a/paddle/fluid/operators/softmax_op_xpu.cc
+++ b/paddle/fluid/operators/softmax_op_xpu.cc
@@ -22,6 +22,8 @@ using DDim = framework::DDim;
 
 template <typename DeviceContext, typename T>
 class SoftmaxXPUKernel : public framework::OpKernel<T> {
+  using XPUType = typename XPUTypeTrait<T>::Type;
+
  public:
   void Compute(const framework::ExecutionContext& context) const override {
     auto* x = context.Input<Tensor>("X");
@@ -43,29 +45,43 @@ class SoftmaxXPUKernel : public framework::OpKernel<T> {
     auto& dev_ctx = context.template device_context<DeviceContext>();
 
     int r = XPU_SUCCESS;
-    Tensor clip_x;
-    int len = x->numel();
-    T* clip_x_data =
-        clip_x.mutable_data<T>(context.GetPlace(), len * sizeof(T));
-    r = xpu::clip_v2(dev_ctx.x_context(), x->data<float>(), clip_x_data, len,
-                     static_cast<float>(-1e20), static_cast<float>(1e20));
-    PADDLE_ENFORCE_EQ(r, XPU_SUCCESS,
-                      platform::errors::External("XPU API(clip) return wrong "
-                                                 "value[%d %s]",
-                                                 r, XPUAPIErrorMsg[r]));
-
-    r = xpu::softmax<T>(dev_ctx.x_context(), clip_x_data, out->data<float>(),
-                        x_dims, axis);
-    PADDLE_ENFORCE_EQ(
-        r, XPU_SUCCESS,
-        platform::errors::External("XPU API(softmax2d_forward) return wrong "
-                                   "value[%d %s]",
-                                   r, XPUAPIErrorMsg[r]));
+    paddle::platform::XPUVersion version = dev_ctx.xpu_version();
+    if (version == paddle::platform::XPUVersion::XPU1) {
+      xpu::ctx_guard RAII_GUARD(dev_ctx.x_context());
+      XPUType* clip_x_data_l3 = RAII_GUARD.alloc_l3_or_gm<XPUType>(x->numel());
+      r = xpu::clip_v2(dev_ctx.x_context(),
+                       reinterpret_cast<const XPUType*>(x->data<T>()),
+                       clip_x_data_l3, x->numel(), static_cast<XPUType>(-1e20),
+                       static_cast<XPUType>(1e20));
+      PADDLE_ENFORCE_EQ(r, XPU_SUCCESS,
+                        platform::errors::External(
+                            "XPU API(clip_v2) return wrong value[%d %s]", r,
+                            XPUAPIErrorMsg[r]));
+      r = xpu::softmax<XPUType>(dev_ctx.x_context(), clip_x_data_l3,
+                                reinterpret_cast<XPUType*>(out->data<T>()),
+                                x_dims, axis);
+      PADDLE_ENFORCE_EQ(
+          r, XPU_SUCCESS,
+          platform::errors::External("XPU API(softmax2d_forward) return wrong "
+                                     "value[%d %s]",
+                                     r, XPUAPIErrorMsg[r]));
+    } else {
+      r = xpu::softmax<XPUType>(
+          dev_ctx.x_context(), reinterpret_cast<const XPUType*>(x->data<T>()),
+          reinterpret_cast<XPUType*>(out->data<T>()), x_dims, axis);
+      PADDLE_ENFORCE_EQ(
+          r, XPU_SUCCESS,
+          platform::errors::External("XPU API(softmax2d_forward) return wrong "
+                                     "value[%d %s]",
+                                     r, XPUAPIErrorMsg[r]));
+    }
   }
 };
 
 template <typename DeviceContext, typename T>
 class SoftmaxGradXPUKernel : public framework::OpKernel<T> {
+  using XPUType = typename XPUTypeTrait<T>::Type;
+
  public:
   void Compute(const framework::ExecutionContext& context) const override {
     auto* out = context.Input<Tensor>("Out");
@@ -86,9 +102,10 @@ class SoftmaxGradXPUKernel : public framework::OpKernel<T> {
     }
 
     auto& dev_ctx = context.template device_context<DeviceContext>();
-    int r = xpu::softmax_grad<T>(dev_ctx.x_context(), out->data<float>(),
-                                 dout->data<float>(), dx->data<float>(), x_dims,
-                                 axis);
+    int r = xpu::softmax_grad<XPUType>(
+        dev_ctx.x_context(), reinterpret_cast<const XPUType*>(out->data<T>()),
+        reinterpret_cast<const XPUType*>(dout->data<T>()),
+        reinterpret_cast<XPUType*>(dx->data<T>()), x_dims, axis);
     PADDLE_ENFORCE_EQ(
         r, XPU_SUCCESS,
         platform::errors::External("XPU API(softmax2d_backward) return wrong "
@@ -103,9 +120,13 @@ class SoftmaxGradXPUKernel : public framework::OpKernel<T> {
 namespace ops = paddle::operators;
 
 REGISTER_OP_XPU_KERNEL(
-    softmax, ops::SoftmaxXPUKernel<paddle::platform::XPUDeviceContext, float>);
+    softmax, ops::SoftmaxXPUKernel<paddle::platform::XPUDeviceContext, float>,
+    ops::SoftmaxXPUKernel<paddle::platform::XPUDeviceContext,
+                          paddle::platform::float16>);
 REGISTER_OP_XPU_KERNEL(
     softmax_grad,
-    ops::SoftmaxGradXPUKernel<paddle::platform::XPUDeviceContext, float>);
+    ops::SoftmaxGradXPUKernel<paddle::platform::XPUDeviceContext, float>,
+    ops::SoftmaxGradXPUKernel<paddle::platform::XPUDeviceContext,
+                              paddle::platform::float16>);
 
 #endif  // PADDLE_WITH_XPU
diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op.cu b/paddle/fluid/operators/softmax_with_cross_entropy_op.cu
index b81a37a68782b..6a9dca9fe2a6a 100644
--- a/paddle/fluid/operators/softmax_with_cross_entropy_op.cu
+++ b/paddle/fluid/operators/softmax_with_cross_entropy_op.cu
@@ -73,17 +73,21 @@ __global__ void CrossEntropyHardLabel(T* loss, const T* softmax,
 
   // thread ids compute loss[ids] using softmax[idx]
   if (ids < n * d) {
-    int64_t idx = idx_n * dim * d + labels[ids] * d + idx_d;
-    if (IgnoreIndex == true) {
-      // IgnoreIndex is true
-      if (labels[ids] == ignore_idx) {
-        loss[ids] = static_cast<T>(0.0);
+    if (labels[ids] < 0) {  // label is negative
+      loss[ids] = static_cast<T>(0.0);
+    } else {  // label is positive of zero
+      int64_t idx = idx_n * dim * d + labels[ids] * d + idx_d;
+      if (IgnoreIndex == true) {
+        // IgnoreIndex is true
+        if (labels[ids] == ignore_idx) {
+          loss[ids] = static_cast<T>(0.0);
+        } else {
+          loss[ids] = -Log(softmax[idx]);
+        }
       } else {
+        // IgnoreIndex is false
         loss[ids] = -Log(softmax[idx]);
       }
-    } else {
-      // IgnoreIndex is false
-      loss[ids] = -Log(softmax[idx]);
     }
   }
 }
diff --git a/paddle/fluid/operators/solve_op.h b/paddle/fluid/operators/solve_op.h
index d55c2647c1f3a..ec72269f697e8 100644
--- a/paddle/fluid/operators/solve_op.h
+++ b/paddle/fluid/operators/solve_op.h
@@ -49,9 +49,9 @@ struct IdentityFunctor {
 };
 
 template <typename DeviceContext, typename T>
-void ReduceSumForSolveGrad(const Tensor* input, Tensor* output,
-                           const std::vector<int>& reduce_dims, bool keep_dim,
-                           const paddle::framework::ExecutionContext& ctx) {
+void ReduceSumForSolve(const Tensor* input, Tensor* output,
+                       const std::vector<int>& reduce_dims, bool keep_dim,
+                       const paddle::framework::ExecutionContext& ctx) {
 #if defined(__NVCC__) || defined(__HIPCC__)
   auto stream = ctx.cuda_device_context().stream();
   TensorReduce<T, T, cub::Sum, IdentityFunctor>(*input, output, reduce_dims,
@@ -157,112 +157,72 @@ static void to_unsqueeze(const framework::ExecutionContext& context,
   out->Resize(out_dims);
 }
 
-template <typename Container>
-Container infer_size_impl(std::vector<int64_t> a, std::vector<int64_t> b) {
-  size_t dimsA = a.size();
-  size_t dimsB = b.size();
-  size_t ndim = dimsA > dimsB ? dimsA : dimsB;
-  Container expandedSizes(ndim);
-
-  for (ptrdiff_t i = (ptrdiff_t)ndim - 1; i >= 0; --i) {
-    ptrdiff_t offset = ndim - 1 - i;
-    ptrdiff_t dimA = dimsA - 1 - offset;
-    ptrdiff_t dimB = dimsB - 1 - offset;
-    int64_t sizeA = (dimA >= 0) ? a[dimA] : 1;
-    int64_t sizeB = (dimB >= 0) ? b[dimB] : 1;
+// Prepared for the broadcast operation
+static std::vector<int64_t> get_broadcast_batch_portion(
+    std::vector<int64_t> x, std::vector<int64_t> y) {
+  size_t size_x = x.size();
+  size_t size_y = y.size();
+  size_t size = std::max(size_x, size_y);
+  std::vector<int64_t> batchPortion(size);
+
+  ptrdiff_t i = (ptrdiff_t)size - 1;
+  for (; i >= 0; --i) {
+    ptrdiff_t offset = size - i - 1;
+    ptrdiff_t dim_x = size_x - offset - 1;
+    ptrdiff_t dim_y = size_y - offset - 1;
+    int64_t x_size = (dim_x >= 0) ? x[dim_x] : 1;
+    int64_t y_size = (dim_y >= 0) ? y[dim_y] : 1;
 
     PADDLE_ENFORCE_EQ(
-        (sizeA == sizeB || sizeA == 1 || sizeB == 1), true,
+        (x_size == y_size || x_size == 1 || y_size == 1), true,
         platform::errors::PreconditionNotMet(
-            "The size of tensor a (%d) must match the size of tensor b "
+            "The size of tensor x (%d) must match the size of tensor y "
             "(%d) at non-singleton dimension %d.",
-            sizeA, sizeB, i));
+            x_size, y_size, i));
 
-    expandedSizes[i] = sizeA == 1 ? sizeB : sizeA;
+    batchPortion[i] = x_size != 1 ? x_size : y_size;
   }
-  return expandedSizes;
-}
-
-// infer size for broadcast operation
-static std::vector<int64_t> infer_size(std::vector<int64_t> a,
-                                       std::vector<int64_t> b) {
-  return infer_size_impl<std::vector<int64_t>>(a, b);
+  return batchPortion;
 }
 
-// necessary check before expand operation
-static void expand_check(const Tensor& arg1,
-                         std::vector<int64_t> expand_shape) {
-  auto rank = arg1.dims().size();
-  PADDLE_ENFORCE_GE(
-      rank, 1, platform::errors::InvalidArgument(
-                   "The rank of the input 'X' for expand must be positive, "
-                   "but the value received is %d.",
-                   rank));
-  PADDLE_ENFORCE_LE(
-      rank, MAX_RANK_SUPPORTED,
-      platform::errors::InvalidArgument(
-          "The rank of the input 'X' for expand must be less than "
-          "or equal to %d, but the value received is %d.",
-          MAX_RANK_SUPPORTED, rank));
-  auto shape_size = static_cast<int>(expand_shape.size());
-  PADDLE_ENFORCE_GE(
-      shape_size, rank,
-      platform::errors::InvalidArgument(
-          "The number (%d) of elements of 'shape' for expand must be "
-          "greater than or equal to the rank (%d) of the input 'X'.",
-          shape_size, rank));
-  PADDLE_ENFORCE_LE(
-      shape_size, MAX_RANK_SUPPORTED,
-      platform::errors::InvalidArgument(
-          "The number (%d) of elements of 'shape' for expand must be "
-          "less than or equal to %d.",
-          shape_size, MAX_RANK_SUPPORTED));
-}
-
-// broadcast the batch dimensions of arg1 and arg2.
+// broadcast the batch dimensions of tensor x and tensor y.
 static inline std::tuple<std::vector<int64_t>, std::vector<int64_t>>
-_broadcast_batch_dims(const Tensor& arg1, const Tensor& arg2) {
-  std::vector<int64_t> arg1_dims_vec =
-      paddle::framework::vectorize(arg1.dims());
-  std::vector<int64_t> arg2_dims_vec =
-      paddle::framework::vectorize(arg2.dims());
+get_broadcast_dims(const Tensor& x, const Tensor& y) {
+  std::vector<int64_t> x_dims_vec = paddle::framework::vectorize(x.dims());
+  std::vector<int64_t> y_dims_vec = paddle::framework::vectorize(y.dims());
 
-  std::vector<int64_t>::const_iterator f1 = arg1_dims_vec.begin();
-  std::vector<int64_t>::const_iterator l1 = arg1_dims_vec.end() - 2;
-  std::vector<int64_t> arg1_dims_vec_cut(f1, l1);
+  std::vector<int64_t>::const_iterator f1 = x_dims_vec.begin();
+  std::vector<int64_t>::const_iterator l1 = x_dims_vec.end() - 2;
+  std::vector<int64_t> x_dims_vec_cut(f1, l1);
 
-  std::vector<int64_t>::const_iterator f2 = arg2_dims_vec.begin();
-  std::vector<int64_t>::const_iterator l2 = arg2_dims_vec.end() - 2;
-  std::vector<int64_t> arg2_dims_vec_cut(f2, l2);
+  std::vector<int64_t>::const_iterator f2 = y_dims_vec.begin();
+  std::vector<int64_t>::const_iterator l2 = y_dims_vec.end() - 2;
+  std::vector<int64_t> y_dims_vec_cut(f2, l2);
 
   std::vector<int64_t> expand_batch_portion =
-      infer_size(arg1_dims_vec_cut, arg2_dims_vec_cut);
+      get_broadcast_batch_portion(x_dims_vec_cut, y_dims_vec_cut);
 
-  std::vector<int64_t> arg1_expand_size({expand_batch_portion});
-  arg1_expand_size.insert(
-      arg1_expand_size.end(),
-      {arg1_dims_vec[static_cast<int>(arg1_dims_vec.size()) - 2],
-       arg1_dims_vec[static_cast<int>(arg1_dims_vec.size()) - 1]});
+  std::vector<int64_t> x_expand_size({expand_batch_portion});
+  x_expand_size.insert(x_expand_size.end(),
+                       {x_dims_vec[static_cast<int>(x_dims_vec.size()) - 2],
+                        x_dims_vec[static_cast<int>(x_dims_vec.size()) - 1]});
 
-  std::vector<int64_t> arg2_expand_size({expand_batch_portion});
-  arg2_expand_size.insert(
-      arg2_expand_size.end(),
-      {arg2_dims_vec[static_cast<int>(arg2_dims_vec.size()) - 2],
-       arg2_dims_vec[static_cast<int>(arg2_dims_vec.size()) - 1]});
+  std::vector<int64_t> y_expand_size({expand_batch_portion});
+  y_expand_size.insert(y_expand_size.end(),
+                       {y_dims_vec[static_cast<int>(y_dims_vec.size()) - 2],
+                        y_dims_vec[static_cast<int>(y_dims_vec.size()) - 1]});
 
-  return std::make_tuple(arg1_expand_size, arg2_expand_size);
+  return std::make_tuple(x_expand_size, y_expand_size);
 }
 
 template <int Rank, typename T, typename DeviceContext>
-void tensor_expand(const framework::ExecutionContext& context,
-                   const Tensor& arg1, Tensor* out0,
-                   std::vector<int64_t> expand_size) {
-  auto in_dims = arg1.dims();
-  auto expand_shape = expand_size;
-  auto vec_in_dims = framework::vectorize<int>(in_dims);
+void expand_impl(const DeviceContext& context, const Tensor& in, Tensor* out,
+                 const std::vector<int64_t>& expand_shape) {
+  auto vec_in_dims = framework::vectorize<int>(in.dims());
   auto diff = expand_shape.size() - vec_in_dims.size();
   vec_in_dims.insert(vec_in_dims.begin(), diff, 1);
   std::vector<int> repeat_times(vec_in_dims.size());
+
   for (size_t i = 0; i < vec_in_dims.size(); ++i) {
     PADDLE_ENFORCE_NE(
         expand_shape[i], 0,
@@ -309,12 +269,11 @@ void tensor_expand(const framework::ExecutionContext& context,
     out_dims[i] *= repeat_times[i];
   }
 
-  out0->Resize(out_dims);
-  auto x = EigenTensor<T, Rank>::From(arg1, new_in_dims);
-  out0->mutable_data<T>(context.GetPlace());
-  auto y = EigenTensor<T, Rank>::From(*out0, out_dims);
-  auto& place =
-      *context.template device_context<DeviceContext>().eigen_device();
+  out->Resize(out_dims);
+  out->mutable_data<T>(context.GetPlace());
+  auto x = EigenTensor<T, Rank>::From(in, new_in_dims);
+  auto y = EigenTensor<T, Rank>::From(*out, out_dims);
+  auto& place = *context.eigen_device();
   // use 32-bit index to speed up
   bool use_32bit_index = y.size() < Eigen::NumTraits<int>::highest();
   if (use_32bit_index) {
@@ -326,6 +285,41 @@ void tensor_expand(const framework::ExecutionContext& context,
   }
 }
 
+template <typename T, typename DeviceContext>
+void TensorExpand(const DeviceContext& context, const Tensor& in, Tensor* out,
+                  const std::vector<int64_t>& expand_shape) {
+  // necessary check before expand operation
+  PADDLE_ENFORCE_GE(expand_shape.size(), in.dims().size(),
+                    platform::errors::InvalidArgument(
+                        "The size of 'expand_shape' (%d) should >= the input "
+                        "Tensor's rank (%d).",
+                        expand_shape.size(), in.dims().size()));
+  PADDLE_ENFORCE_LE(expand_shape.size(), MAX_RANK_SUPPORTED,
+                    platform::errors::InvalidArgument(
+                        "The size of 'expand_shape' (%d) should be <= %d",
+                        expand_shape.size(), MAX_RANK_SUPPORTED));
+  switch (expand_shape.size()) {
+    case 1:
+      expand_impl<1, T, DeviceContext>(context, in, out, expand_shape);
+      break;
+    case 2:
+      expand_impl<2, T, DeviceContext>(context, in, out, expand_shape);
+      break;
+    case 3:
+      expand_impl<3, T, DeviceContext>(context, in, out, expand_shape);
+      break;
+    case 4:
+      expand_impl<4, T, DeviceContext>(context, in, out, expand_shape);
+      break;
+    case 5:
+      expand_impl<5, T, DeviceContext>(context, in, out, expand_shape);
+      break;
+    case 6:
+      expand_impl<6, T, DeviceContext>(context, in, out, expand_shape);
+      break;
+  }
+}
+
 template <typename DeviceContext, typename T>
 static void linalg_solve(const framework::ExecutionContext& context,
                          const framework::Tensor* x, const framework::Tensor* y,
@@ -362,71 +356,13 @@ static void linalg_solve(const framework::ExecutionContext& context,
   std::vector<int64_t> x_broadcast_dims;
   std::vector<int64_t> y_broadcast_dims;
   std::tie(x_broadcast_dims, y_broadcast_dims) =
-      _broadcast_batch_dims(tmp_x, tmp_y);
-
-  expand_check(tmp_x, x_broadcast_dims);
-  expand_check(tmp_y, y_broadcast_dims);
+      get_broadcast_dims(tmp_x, tmp_y);
 
   Tensor tmp_x_bc;
-  Tensor tmp_y_bc;
-  auto tmp_x_rank = tmp_x.dims().size();
-  auto tmp_y_rank = tmp_y.dims().size();
+  TensorExpand<T, DeviceContext>(dev_ctx, tmp_x, &tmp_x_bc, x_broadcast_dims);
 
-  auto rank_0 = std::max(tmp_x_rank, static_cast<int>(x_broadcast_dims.size()));
-  switch (rank_0) {
-    case 1:
-      tensor_expand<1, T, DeviceContext>(context, tmp_x, &tmp_x_bc,
-                                         x_broadcast_dims);
-      break;
-    case 2:
-      tensor_expand<2, T, DeviceContext>(context, tmp_x, &tmp_x_bc,
-                                         x_broadcast_dims);
-      break;
-    case 3:
-      tensor_expand<3, T, DeviceContext>(context, tmp_x, &tmp_x_bc,
-                                         x_broadcast_dims);
-      break;
-    case 4:
-      tensor_expand<4, T, DeviceContext>(context, tmp_x, &tmp_x_bc,
-                                         x_broadcast_dims);
-      break;
-    case 5:
-      tensor_expand<5, T, DeviceContext>(context, tmp_x, &tmp_x_bc,
-                                         x_broadcast_dims);
-      break;
-    case 6:
-      tensor_expand<6, T, DeviceContext>(context, tmp_x, &tmp_x_bc,
-                                         x_broadcast_dims);
-      break;
-  }
-
-  auto rank_1 = std::max(tmp_y_rank, static_cast<int>(y_broadcast_dims.size()));
-  switch (rank_1) {
-    case 1:
-      tensor_expand<1, T, DeviceContext>(context, tmp_y, &tmp_y_bc,
-                                         y_broadcast_dims);
-      break;
-    case 2:
-      tensor_expand<2, T, DeviceContext>(context, tmp_y, &tmp_y_bc,
-                                         y_broadcast_dims);
-      break;
-    case 3:
-      tensor_expand<3, T, DeviceContext>(context, tmp_y, &tmp_y_bc,
-                                         y_broadcast_dims);
-      break;
-    case 4:
-      tensor_expand<4, T, DeviceContext>(context, tmp_y, &tmp_y_bc,
-                                         y_broadcast_dims);
-      break;
-    case 5:
-      tensor_expand<5, T, DeviceContext>(context, tmp_y, &tmp_y_bc,
-                                         y_broadcast_dims);
-      break;
-    case 6:
-      tensor_expand<6, T, DeviceContext>(context, tmp_y, &tmp_y_bc,
-                                         y_broadcast_dims);
-      break;
-  }
+  Tensor tmp_y_bc;
+  TensorExpand<T, DeviceContext>(dev_ctx, tmp_y, &tmp_y_bc, y_broadcast_dims);
 
   auto x_dim = x->dims();
   auto y_dim = y->dims();
@@ -566,7 +502,7 @@ class SolveGradKernel : public framework::OpKernel<T> {
     std::vector<int64_t> x_broadcast_dims;
     std::vector<int64_t> y_broadcast_dims;
     std::tie(x_broadcast_dims, y_broadcast_dims) =
-        _broadcast_batch_dims(tmp_x, tmp_y);
+        get_broadcast_dims(tmp_x, tmp_y);
 
     // tmp_dx
     Tensor tmp_dx;
@@ -666,8 +602,8 @@ class SolveGradKernel : public framework::OpKernel<T> {
           if (dy_help.dims().size() != dy->dims().size()) {
             keep_dim = false;
           }
-          ReduceSumForSolveGrad<DeviceContext, T>(&dy_help, dy, dy_reduce_dims,
-                                                  keep_dim, ctx);
+          ReduceSumForSolve<DeviceContext, T>(&dy_help, dy, dy_reduce_dims,
+                                              keep_dim, ctx);
         }
         dy->Resize(y->dims());
       }
@@ -716,8 +652,8 @@ class SolveGradKernel : public framework::OpKernel<T> {
           if (dx_help.dims().size() != dx->dims().size()) {
             keep_dim = false;
           }
-          ReduceSumForSolveGrad<DeviceContext, T>(&dx_help, dx, dx_reduce_dims,
-                                                  keep_dim, ctx);
+          ReduceSumForSolve<DeviceContext, T>(&dx_help, dx, dx_reduce_dims,
+                                              keep_dim, ctx);
         }
         dx->Resize(input->dims());
       }
diff --git a/paddle/fluid/operators/spectral_helper.h b/paddle/fluid/operators/spectral_helper.h
index 9c34d500eac92..924ec7cd52d50 100644
--- a/paddle/fluid/operators/spectral_helper.h
+++ b/paddle/fluid/operators/spectral_helper.h
@@ -27,12 +27,12 @@
 namespace paddle {
 namespace operators {
 using ScalarType = framework::proto::VarType::Type;
-const int64_t kMaxCUFFTNdim = 3;
-const int64_t kMaxDataNdim = kMaxCUFFTNdim + 1;
+const int64_t kMaxFFTNdim = 3;
+const int64_t kMaxDataNdim = kMaxFFTNdim + 1;
 // This struct is used to easily compute hashes of the
 // parameters. It will be the **key** to the plan cache.
-struct PlanKey {
-  // between 1 and kMaxCUFFTNdim, i.e., 1 <= signal_ndim <= 3
+struct FFTConfigKey {
+  // between 1 and kMaxFFTNdim, i.e., 1 <= signal_ndim <= 3
   int64_t signal_ndim_;
   // These include additional batch dimension as well.
   int64_t sizes_[kMaxDataNdim];
@@ -41,12 +41,12 @@ struct PlanKey {
   FFTTransformType fft_type_;
   ScalarType value_type_;
 
-  PlanKey() = default;
+  FFTConfigKey() = default;
 
-  PlanKey(const std::vector<int64_t>& in_shape,
-          const std::vector<int64_t>& out_shape,
-          const std::vector<int64_t>& signal_size, FFTTransformType fft_type,
-          ScalarType value_type) {
+  FFTConfigKey(const std::vector<int64_t>& in_shape,
+               const std::vector<int64_t>& out_shape,
+               const std::vector<int64_t>& signal_size,
+               FFTTransformType fft_type, ScalarType value_type) {
     // Padding bits must be zeroed for hashing
     memset(this, 0, sizeof(*this));
     signal_ndim_ = signal_size.size() - 1;
@@ -69,6 +69,12 @@ class CuFFTHandle {
     PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cufftCreate(&handle_));
   }
 
+  CuFFTHandle(const CuFFTHandle& other) = delete;
+  CuFFTHandle& operator=(const CuFFTHandle& other) = delete;
+
+  CuFFTHandle(CuFFTHandle&& other) = delete;
+  CuFFTHandle& operator=(CuFFTHandle&& other) = delete;
+
   ::cufftHandle& get() { return handle_; }
   const ::cufftHandle& get() const { return handle_; }
 
@@ -81,20 +87,20 @@ using plan_size_type = long long int;  // NOLINT
 // This class contains all the information needed to execute a cuFFT plan:
 //   1. the plan
 //   2. the workspace size needed
-class CuFFTConfig {
+class FFTConfig {
  public:
   // Only move semantics is enought for this class. Although we already use
   // unique_ptr for the plan, still remove copy constructor and assignment op so
   // we don't accidentally copy and take perf hit.
-  explicit CuFFTConfig(const PlanKey& plan_key)
-      : CuFFTConfig(
+  explicit FFTConfig(const FFTConfigKey& plan_key)
+      : FFTConfig(
             std::vector<int64_t>(plan_key.sizes_,
                                  plan_key.sizes_ + plan_key.signal_ndim_ + 1),
             plan_key.signal_ndim_, plan_key.fft_type_, plan_key.value_type_) {}
 
   // sizes are full signal, including batch size and always two-sided
-  CuFFTConfig(const std::vector<int64_t>& sizes, const int64_t signal_ndim,
-              FFTTransformType fft_type, ScalarType dtype)
+  FFTConfig(const std::vector<int64_t>& sizes, const int64_t signal_ndim,
+            FFTTransformType fft_type, ScalarType dtype)
       : fft_type_(fft_type), value_type_(dtype) {
     // signal sizes (excluding batch dim)
     std::vector<plan_size_type> signal_sizes(sizes.begin() + 1, sizes.end());
@@ -144,6 +150,12 @@ class CuFFTConfig {
     ws_size = ws_size_t;
   }
 
+  FFTConfig(const FFTConfig& other) = delete;
+  FFTConfig& operator=(const FFTConfig& other) = delete;
+
+  FFTConfig(FFTConfig&& other) = delete;
+  FFTConfig& operator=(FFTConfig&& other) = delete;
+
   const cufftHandle& plan() const { return plan_ptr.get(); }
 
   FFTTransformType transform_type() const { return fft_type_; }
@@ -167,6 +179,12 @@ class HIPFFTHandle {
     PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::hipfftCreate(&handle_));
   }
 
+  HIPFFTHandle(const HIPFFTHandle& other) = delete;
+  HIPFFTHandle& operator=(const HIPFFTHandle& other) = delete;
+
+  HIPFFTHandle(HIPFFTHandle&& other) = delete;
+  HIPFFTHandle& operator=(HIPFFTHandle&& other) = delete;
+
   ::hipfftHandle& get() { return handle_; }
   const ::hipfftHandle& get() const { return handle_; }
 
@@ -178,20 +196,20 @@ using plan_size_type = int;
 // This class contains all the information needed to execute a cuFFT plan:
 //   1. the plan
 //   2. the workspace size needed
-class HIPFFTConfig {
+class FFTConfig {
  public:
   // Only move semantics is enought for this class. Although we already use
   // unique_ptr for the plan, still remove copy constructor and assignment op so
   // we don't accidentally copy and take perf hit.
-  explicit HIPFFTConfig(const PlanKey& plan_key)
-      : HIPFFTConfig(
+  explicit FFTConfig(const FFTConfigKey& plan_key)
+      : FFTConfig(
             std::vector<int64_t>(plan_key.sizes_,
                                  plan_key.sizes_ + plan_key.signal_ndim_ + 1),
             plan_key.signal_ndim_, plan_key.fft_type_, plan_key.value_type_) {}
 
   // sizes are full signal, including batch size and always two-sided
-  HIPFFTConfig(const std::vector<int64_t>& sizes, const int64_t signal_ndim,
-               FFTTransformType fft_type, ScalarType dtype)
+  FFTConfig(const std::vector<int64_t>& sizes, const int64_t signal_ndim,
+            FFTTransformType fft_type, ScalarType dtype)
       : fft_type_(fft_type), value_type_(dtype) {
     // signal sizes (excluding batch dim)
     std::vector<plan_size_type> signal_sizes(sizes.begin() + 1, sizes.end());
@@ -257,5 +275,192 @@ class HIPFFTConfig {
   ScalarType value_type_;
 };
 #endif
+
+// Hashing machinery for Key
+// Fowler–Noll–Vo hash function
+// see
+// https://en.wikipedia.org/wiki/Fowler%E2%80%93Noll%E2%80%93Vo_hash_function
+template <typename Key>
+struct KeyHash {
+  // Key must be a POD because we read out its memory
+  // contenst as char* when hashing
+  static_assert(std::is_pod<Key>::value, "Key must be plain old data type");
+
+  size_t operator()(const Key& params) const {
+    auto ptr = reinterpret_cast<const uint8_t*>(&params);
+    uint32_t value = 0x811C9DC5;
+    for (int i = 0; i < static_cast<int>(sizeof(Key)); ++i) {
+      value ^= ptr[i];
+      value *= 0x01000193;
+    }
+    return static_cast<size_t>(value);
+  }
+};
+
+template <typename Key>
+struct KeyEqual {
+  // Key must be a POD because we read out its memory
+  // contenst as char* when comparing
+  static_assert(std::is_pod<Key>::value, "Key must be plain old data type");
+
+  bool operator()(const Key& a, const Key& b) const {
+    auto ptr1 = reinterpret_cast<const uint8_t*>(&a);
+    auto ptr2 = reinterpret_cast<const uint8_t*>(&b);
+    return memcmp(ptr1, ptr2, sizeof(Key)) == 0;
+  }
+};
+
+#if CUDA_VERSION < 10000
+// Note that the max plan number for CUDA version < 10 has to be 1023
+// due to a bug that fails on the 1024th plan
+constexpr size_t CUFFT_MAX_PLAN_NUM = 1023;
+constexpr size_t CUFFT_DEFAULT_CACHE_SIZE = CUFFT_MAX_PLAN_NUM;
+#else
+constexpr size_t CUFFT_MAX_PLAN_NUM = std::numeric_limits<size_t>::max();
+// The default max cache size chosen for CUDA version > 10 is arbitrary.
+// This number puts a limit on how big of a plan cache should we maintain by
+// default. Users can always configure it via cufft_set_plan_cache_max_size.
+constexpr size_t CUFFT_DEFAULT_CACHE_SIZE = 4096;
+#endif
+static_assert(CUFFT_MAX_PLAN_NUM >= 0 &&
+                  CUFFT_MAX_PLAN_NUM <= std::numeric_limits<size_t>::max(),
+              "CUFFT_MAX_PLAN_NUM not in size_t range");
+static_assert(CUFFT_DEFAULT_CACHE_SIZE >= 0 &&
+                  CUFFT_DEFAULT_CACHE_SIZE <= CUFFT_MAX_PLAN_NUM,
+              "CUFFT_DEFAULT_CACHE_SIZE not in [0, CUFFT_MAX_PLAN_NUM] range");
+
+// This cache assumes that the mapping from key to value never changes.
+// This is **NOT** thread-safe. Please use a mutex when using it **AND** the
+// value returned from try_emplace_value.
+// The contract of using this cache is that try_emplace_value should only be
+// used when the max_size is positive.
+class FFTConfigCache {
+ public:
+  using kv_t = typename std::pair<FFTConfigKey, FFTConfig>;
+  using map_t = typename std::unordered_map<
+      std::reference_wrapper<FFTConfigKey>, typename std::list<kv_t>::iterator,
+      KeyHash<FFTConfigKey>, KeyEqual<FFTConfigKey>>;
+  using map_kkv_iter_t = typename map_t::iterator;
+
+  FFTConfigCache() : FFTConfigCache(CUFFT_DEFAULT_CACHE_SIZE) {}
+
+  explicit FFTConfigCache(int64_t max_size) { _set_max_size(max_size); }
+
+  FFTConfigCache(const FFTConfigCache& other) = delete;
+  FFTConfigCache& operator=(const FFTConfigCache& other) = delete;
+
+  FFTConfigCache(FFTConfigCache&& other) noexcept
+      : _usage_list(std::move(other._usage_list)),
+        _cache_map(std::move(other._cache_map)),
+        _max_size(other._max_size) {}
+
+  FFTConfigCache& operator=(FFTConfigCache&& other) noexcept {
+    _usage_list = std::move(other._usage_list);
+    _cache_map = std::move(other._cache_map);
+    _max_size = other._max_size;
+    return *this;
+  }
+
+  // If key is in this cache, return the cached config. Otherwise, emplace the
+  // config in this cache and return it.
+  FFTConfig& lookup(FFTConfigKey params) {
+    PADDLE_ENFORCE_GT(_max_size, 0,
+                      platform::errors::InvalidArgument(
+                          "The max size of FFTConfigCache must be great than 0,"
+                          "But received is [%d]",
+                          _max_size));
+
+    map_kkv_iter_t map_it = _cache_map.find(params);
+    // Hit, put to list front
+    if (map_it != _cache_map.end()) {
+      _usage_list.splice(_usage_list.begin(), _usage_list, map_it->second);
+      return map_it->second->second;
+    }
+
+    // Miss
+    // remove if needed
+    if (_usage_list.size() >= _max_size) {
+      auto last = _usage_list.end();
+      last--;
+      _cache_map.erase(last->first);
+      _usage_list.pop_back();
+    }
+
+    // construct new plan at list front, then insert into _cache_map
+    _usage_list.emplace_front(std::piecewise_construct,
+                              std::forward_as_tuple(params),
+                              std::forward_as_tuple(params));
+    auto kv_it = _usage_list.begin();
+    _cache_map.emplace(std::piecewise_construct,
+                       std::forward_as_tuple(kv_it->first),
+                       std::forward_as_tuple(kv_it));
+    return kv_it->second;
+  }
+
+  void clear() {
+    _cache_map.clear();
+    _usage_list.clear();
+  }
+
+  void resize(int64_t new_size) {
+    _set_max_size(new_size);
+    auto cur_size = _usage_list.size();
+    if (cur_size > _max_size) {
+      auto delete_it = _usage_list.end();
+      for (size_t i = 0; i < cur_size - _max_size; i++) {
+        delete_it--;
+        _cache_map.erase(delete_it->first);
+      }
+      _usage_list.erase(delete_it, _usage_list.end());
+    }
+  }
+
+  size_t size() const { return _cache_map.size(); }
+
+  size_t max_size() const noexcept { return _max_size; }
+
+  std::mutex mutex;
+
+ private:
+  // Only sets size and does value check. Does not resize the data structures.
+  void _set_max_size(int64_t new_size) {
+    // We check that 0 <= new_size <= CUFFT_MAX_PLAN_NUM here. Since
+    // CUFFT_MAX_PLAN_NUM is of type size_t, we need to do non-negativity check
+    // first.
+    PADDLE_ENFORCE_GE(
+        new_size, 0,
+        platform::errors::InvalidArgument(
+            "cuFFT plan cache size must be non-negative, But received is [%d]",
+            new_size));
+    PADDLE_ENFORCE_LE(new_size, CUFFT_MAX_PLAN_NUM,
+                      platform::errors::InvalidArgument(
+                          "cuFFT plan cache size can not be larger than [%d], "
+                          "But received is [%d]",
+                          CUFFT_MAX_PLAN_NUM, new_size));
+    _max_size = static_cast<size_t>(new_size);
+  }
+
+  std::list<kv_t> _usage_list;
+  map_t _cache_map;
+  size_t _max_size;
+};
+
+static std::vector<std::unique_ptr<FFTConfigCache>> plan_caches;
+static std::mutex plan_caches_mutex;
+
+static inline FFTConfigCache& get_fft_plan_cache(int64_t device_index) {
+  std::lock_guard<std::mutex> guard(plan_caches_mutex);
+
+  if (device_index >= plan_caches.size()) {
+    plan_caches.resize(device_index + 1);
+  }
+
+  if (!plan_caches[device_index]) {
+    plan_caches[device_index] = std::make_unique<FFTConfigCache>();
+  }
+
+  return *plan_caches[device_index];
+}
+
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/fluid/operators/spectral_op.cu b/paddle/fluid/operators/spectral_op.cu
index e8a4fac2915d7..8e42a070a398e 100644
--- a/paddle/fluid/operators/spectral_op.cu
+++ b/paddle/fluid/operators/spectral_op.cu
@@ -68,9 +68,9 @@ void exec_normalization(const DeviceContext& ctx, const Tensor* in, Tensor* out,
 }
 
 #if defined(PADDLE_WITH_CUDA)
-CuFFTConfig create_cufft_config(const framework::Tensor& input,
-                                const framework::Tensor& output,
-                                int signal_ndim) {
+FFTConfigKey create_fft_configkey(const framework::Tensor& input,
+                                  const framework::Tensor& output,
+                                  int signal_ndim) {
   // Create the transform plan (either from cache or locally)
   const auto value_type = framework::IsComplexType(input.type())
                               ? framework::ToRealType(input.type())
@@ -85,15 +85,14 @@ CuFFTConfig create_cufft_config(const framework::Tensor& input,
     auto out_size = output.dims()[i];
     signal_size[i] = std::max(in_size, out_size);
   }
-  PlanKey key(framework::vectorize(input.dims()),
-              framework::vectorize(output.dims()), signal_size, fft_type,
-              value_type);
-
-  return CuFFTConfig(key);
+  FFTConfigKey key(framework::vectorize(input.dims()),
+                   framework::vectorize(output.dims()), signal_size, fft_type,
+                   value_type);
+  return key;
 }
 
 // Execute a pre-planned transform
-static void exec_cufft_plan_raw(const CuFFTConfig& config, void* in_data,
+static void exec_cufft_plan_raw(const FFTConfig& config, void* in_data,
                                 void* out_data, bool forward) {
   auto& plan = config.plan();
 
@@ -102,7 +101,7 @@ static void exec_cufft_plan_raw(const CuFFTConfig& config, void* in_data,
 }
 
 template <typename DeviceContext, typename Ti, typename To>
-void exec_cufft_plan(const DeviceContext& ctx, const CuFFTConfig& config,
+void exec_cufft_plan(const DeviceContext& ctx, const FFTConfig& config,
                      framework::Tensor* input, framework::Tensor* output,
                      bool forward) {
   // execute transform plan
@@ -136,7 +135,7 @@ void exec_cufft_plan(const DeviceContext& ctx, const CuFFTConfig& config,
 
 #elif defined(PADDLE_WITH_HIP)
 
-HIPFFTConfig create_hipfft_config(const framework::Tensor& input,
+FFTConfigKey create_fft_configkey(const framework::Tensor& input,
                                   const framework::Tensor& output,
                                   int signal_ndim) {
   // Create the transform plan (either from cache or locally)
@@ -153,15 +152,14 @@ HIPFFTConfig create_hipfft_config(const framework::Tensor& input,
     auto out_size = output.dims()[i];
     signal_size[i] = std::max(in_size, out_size);
   }
-  PlanKey key(framework::vectorize(input.dims()),
-              framework::vectorize(output.dims()), signal_size, fft_type,
-              value_type);
-
-  return HIPFFTConfig(key);
+  FFTConfigKey key(framework::vectorize(input.dims()),
+                   framework::vectorize(output.dims()), signal_size, fft_type,
+                   value_type);
+  return key;
 }
 
 // Execute a pre-planned transform
-static void exec_hipfft_plan_raw(const HIPFFTConfig& config, void* in_data,
+static void exec_hipfft_plan_raw(const FFTConfig& config, void* in_data,
                                  void* out_data, bool forward) {
   auto& plan = config.plan();
 
@@ -216,7 +214,7 @@ static void exec_hipfft_plan_raw(const HIPFFTConfig& config, void* in_data,
 }
 
 template <typename DeviceContext, typename Ti, typename To>
-void exec_hipfft_plan(const DeviceContext& ctx, const HIPFFTConfig& config,
+void exec_hipfft_plan(const DeviceContext& ctx, const FFTConfig& config,
                       framework::Tensor* input, framework::Tensor* output,
                       bool forward) {
   auto fft_type = config.transform_type();
@@ -308,34 +306,58 @@ void exec_fft(const DeviceContext& ctx, const Tensor* X, Tensor* out,
   collapsed_output.Resize(framework::make_ddim(collapsed_output_shape));
   collapsed_output.mutable_data<To>(tensor_place);
 
+  FFTConfig* config = nullptr;
+
 #if defined(PADDLE_WITH_CUDA)
+  std::unique_ptr<FFTConfig> config_ = nullptr;
   // create plan
-  CuFFTConfig config =
-      create_cufft_config(collapsed_input, collapsed_output, signal_ndim);
+  FFTConfigKey key =
+      create_fft_configkey(collapsed_input, collapsed_output, signal_ndim);
+  if (CUFFT_VERSION < 10200) {
+    const int64_t device_id = static_cast<int64_t>(
+        reinterpret_cast<const platform::CUDAPlace*>(&collapsed_input.place())
+            ->GetDeviceId());
+    FFTConfigCache& plan_cache = get_fft_plan_cache(device_id);
+    std::unique_lock<std::mutex> guard(plan_cache.mutex, std::defer_lock);
+    guard.lock();
+    config = &(plan_cache.lookup(key));
+  } else {
+    config_ = std::make_unique<FFTConfig>(key);
+    config = config_.get();
+  }
+
   // prepare cufft for execution
   PADDLE_ENFORCE_CUDA_SUCCESS(
-      platform::dynload::cufftSetStream(config.plan(), ctx.stream()));
+      platform::dynload::cufftSetStream(config->plan(), ctx.stream()));
   framework::Tensor workspace_tensor;
-  workspace_tensor.mutable_data<To>(tensor_place, config.workspace_size());
+  workspace_tensor.mutable_data<To>(tensor_place, config->workspace_size());
   PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cufftSetWorkArea(
-      config.plan(), workspace_tensor.data<To>()));
+      config->plan(), workspace_tensor.data<To>()));
   // execute transform plan
-  exec_cufft_plan<DeviceContext, Ti, To>(ctx, config, &collapsed_input,
+  exec_cufft_plan<DeviceContext, Ti, To>(ctx, *config, &collapsed_input,
                                          &collapsed_output, forward);
 
 #elif defined(PADDLE_WITH_HIP)
   // create plan
-  HIPFFTConfig config =
-      create_hipfft_config(collapsed_input, collapsed_output, signal_ndim);
+  FFTConfigKey key =
+      create_fft_configkey(collapsed_input, collapsed_output, signal_ndim);
+  const int64_t device_id = static_cast<int64_t>(
+      reinterpret_cast<const platform::CUDAPlace*>(&collapsed_input.place())
+          ->GetDeviceId());
+  FFTConfigCache& plan_cache = get_fft_plan_cache(device_id);
+  std::unique_lock<std::mutex> guard(plan_cache.mutex, std::defer_lock);
+  guard.lock();
+  config = &(plan_cache.lookup(key));
+
   // prepare cufft for execution
   PADDLE_ENFORCE_CUDA_SUCCESS(
-      platform::dynload::hipfftSetStream(config.plan(), ctx.stream()));
+      platform::dynload::hipfftSetStream(config->plan(), ctx.stream()));
   framework::Tensor workspace_tensor;
-  workspace_tensor.mutable_data<To>(tensor_place, config.workspace_size());
+  workspace_tensor.mutable_data<To>(tensor_place, config->workspace_size());
   PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::hipfftSetWorkArea(
-      config.plan(), workspace_tensor.data<To>()));
+      config->plan(), workspace_tensor.data<To>()));
   // execute transform plan
-  exec_hipfft_plan<DeviceContext, Ti, To>(ctx, config, &collapsed_input,
+  exec_hipfft_plan<DeviceContext, Ti, To>(ctx, *config, &collapsed_input,
                                           &collapsed_output, forward);
 #endif
 
@@ -358,10 +380,10 @@ void exec_fft(const DeviceContext& ctx, const Tensor* X, Tensor* out,
 
 // Use the optimized path to perform single R2C or C2R if transformation dim is
 // supported by cuFFT
-bool use_optimized_cufft_path(const std::vector<int64_t>& axes) {
+bool use_optimized_fft_path(const std::vector<int64_t>& axes) {
   // For performance reason, when axes starts with (0, 1), do not use the
   // optimized path.
-  if (axes.size() > kMaxCUFFTNdim ||
+  if (axes.size() > kMaxFFTNdim ||
       (axes.size() >= 2 && axes[0] == 0 && axes[1] == 1)) {
     return false;
   } else {
@@ -391,7 +413,7 @@ struct FFTC2CFunctor<platform::CUDADeviceContext, Ti, To> {
 
     while (true) {
       max_dims =
-          std::min(static_cast<size_t>(kMaxCUFFTNdim), working_axes.size());
+          std::min(static_cast<size_t>(kMaxFFTNdim), working_axes.size());
       first_dims.assign(working_axes.end() - max_dims, working_axes.end());
 
       exec_fft<platform::CUDADeviceContext, Ti, To>(ctx, p_working_tensor,
@@ -418,7 +440,7 @@ struct FFTC2RFunctor<platform::CUDADeviceContext, Ti, To> {
     std::vector<int64_t> in_dims = framework::vectorize(X->dims());
     std::vector<int64_t> out_dims = framework::vectorize(out->dims());
 
-    if (use_optimized_cufft_path(axes)) {
+    if (use_optimized_fft_path(axes)) {
       framework::Tensor x_copy(X->type());
       x_copy.mutable_data<Ti>(X->dims(), ctx.GetPlace());
       framework::TensorCopy(*X, ctx.GetPlace(), &x_copy);
diff --git a/paddle/fluid/operators/stack_op.cc b/paddle/fluid/operators/stack_op.cc
index fbd8d8b2e0727..0a813759aa3ec 100644
--- a/paddle/fluid/operators/stack_op.cc
+++ b/paddle/fluid/operators/stack_op.cc
@@ -71,6 +71,21 @@ class StackOp : public framework::OperatorWithKernel {
     vec.insert(vec.begin() + axis, input_dims.size());
     ctx->SetOutputDim("Y", framework::make_ddim(vec));
   }
+
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext &ctx) const override {
+    auto input_data_type =
+        framework::OperatorWithKernel::IndicateVarDataType(ctx, "X");
+
+#ifdef PADDLE_WITH_MKLDNN
+    if (this->CanMKLDNNBeUsed(ctx, input_data_type)) {
+      return framework::OpKernelType(input_data_type, ctx.GetPlace(),
+                                     framework::DataLayout::kMKLDNN,
+                                     framework::LibraryType::kMKLDNN);
+    }
+#endif
+    return framework::OpKernelType(input_data_type, ctx.GetPlace());
+  }
 };
 
 class StackOpMaker : public framework::OpProtoAndCheckerMaker {
@@ -81,6 +96,11 @@ class StackOpMaker : public framework::OpProtoAndCheckerMaker {
     AddAttr<int>("axis",
                  "The axis along which all of the Inputs(X) should be stacked.")
         .SetDefault(0);
+    AddAttr<bool>(
+        "use_mkldnn",
+        "(bool, default false) Indicates if MKL-DNN kernel will be used")
+        .SetDefault(false)
+        .AsExtra();
     AddComment(R"DOC(
 Stack Operator.
 Stack all of the Inputs(X) into one tensor along Attr(axis). The dims of all Inputs(X) must be the same.
diff --git a/paddle/fluid/operators/tile_op.h b/paddle/fluid/operators/tile_op.h
index 260cbc2368731..5211d72336124 100644
--- a/paddle/fluid/operators/tile_op.h
+++ b/paddle/fluid/operators/tile_op.h
@@ -33,6 +33,7 @@ inline std::vector<int> get_repeat_times(
     auto* repeat_data = repeat_tensor->data<int>();
     framework::Tensor cpu_repeat_tensor;
     if (platform::is_gpu_place(repeat_tensor->place()) ||
+        platform::is_xpu_place(repeat_tensor->place()) ||
         platform::is_npu_place(repeat_tensor->place())) {
       TensorCopySync(*repeat_tensor, platform::CPUPlace(), &cpu_repeat_tensor);
       repeat_data = cpu_repeat_tensor.data<int>();
@@ -50,6 +51,7 @@ inline std::vector<int> get_repeat_times(
     for (size_t i = 0; i < list_repeat_times_tensor.size(); ++i) {
       auto tensor = list_repeat_times_tensor[i];
       if (platform::is_gpu_place(tensor->place()) ||
+          platform::is_xpu_place(tensor->place()) ||
           platform::is_npu_place(tensor->place())) {
         framework::Tensor temp;
         TensorCopySync(*tensor, platform::CPUPlace(), &temp);
diff --git a/paddle/fluid/operators/tile_op_xpu.cc b/paddle/fluid/operators/tile_op_xpu.cc
new file mode 100644
index 0000000000000..94b0e465cfedb
--- /dev/null
+++ b/paddle/fluid/operators/tile_op_xpu.cc
@@ -0,0 +1,119 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef PADDLE_WITH_XPU
+
+#include "paddle/fluid/operators/tile_op.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename T>
+class TileXPUKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto rank = context.Input<Tensor>("X")->dims().size();
+    PADDLE_ENFORCE_GE(
+        rank, 1, platform::errors::InvalidArgument(
+                     "The rank of the input 'x' for tile op must be a positive "
+                     "integer, but the value received is %d.",
+                     rank));
+    PADDLE_ENFORCE_LE(
+        rank, MAX_RANK_SUPPORTED,
+        platform::errors::InvalidArgument(
+            "The rank of the input 'x' for tile op "
+            "must be less than or equal to %d, but the value received is %d.",
+            MAX_RANK_SUPPORTED, rank));
+    auto repeat_times = get_repeat_times(context);
+    int repeat_times_size = repeat_times.size();
+    PADDLE_ENFORCE_GE(
+        repeat_times_size, 1,
+        platform::errors::InvalidArgument(
+            "The number of elements of the input 'repeat_times' for tile "
+            "op must be positive, but the value received is %d.",
+            repeat_times_size));
+    PADDLE_ENFORCE_LE(
+        repeat_times_size, MAX_RANK_SUPPORTED,
+        platform::errors::InvalidArgument(
+            "The number of elements of the input 'repeat_times' for tile op "
+            "must be less than or equal to %d, but the value received is %d.",
+            MAX_RANK_SUPPORTED, repeat_times_size));
+
+    auto* in0 = context.Input<framework::Tensor>("X");
+    auto in_dims = in0->dims();
+    for (size_t i = 0; i < repeat_times.size(); ++i) {
+      PADDLE_ENFORCE_GT(
+          repeat_times[i], 0,
+          platform::errors::InvalidArgument(
+              "All elements of the input 'repeat_times' for tile op must "
+              "be positive integers, but the value received is %d.",
+              repeat_times[i]));
+    }
+    auto vec_in_dims = framework::vectorize<int>(in_dims);
+    if (repeat_times.size() < vec_in_dims.size()) {
+      int diff = vec_in_dims.size() - repeat_times.size();
+      repeat_times.insert(repeat_times.begin(), diff, 1);
+    } else {
+      int diff = repeat_times.size() - vec_in_dims.size();
+      vec_in_dims.insert(vec_in_dims.begin(), diff, 1);
+    }
+    PADDLE_ENFORCE_EQ(
+        repeat_times.size(), vec_in_dims.size(),
+        platform::errors::InvalidArgument(
+            "The rank (%d) of the input 'x' and the rank (%d) of the input "
+            "'repeat_times' for tile op must match after promotion.",
+            vec_in_dims.size(), repeat_times.size()));
+
+    auto* out0 = context.Output<framework::Tensor>("Out");
+    framework::DDim new_in_dims = framework::make_ddim(vec_in_dims);
+    framework::DDim out_dims(new_in_dims);
+
+    for (size_t i = 0; i < repeat_times.size(); ++i) {
+      out_dims[i] *= repeat_times[i];
+    }
+    auto vec_out_dims = framework::vectorize<int>(out_dims);
+    out0->Resize(out_dims);
+    out0->mutable_data<T>(context.GetPlace());
+
+    auto& dev_ctx =
+        context.template device_context<paddle::platform::XPUDeviceContext>();
+    std::vector<int> temp(repeat_times.size(), 1);
+    if (repeat_times == temp) {
+      framework::TensorCopy(*in0, context.GetPlace(), dev_ctx, out0);
+      return;
+    }
+
+    int ret = XPU_SUCCESS;
+    if (std::is_same<T, bool>::value) {
+      ret = xpu::broadcast<int8_t>(
+          dev_ctx.x_context(), reinterpret_cast<const int8_t*>(in0->data<T>()),
+          reinterpret_cast<int8_t*>(out0->data<T>()), vec_in_dims,
+          vec_out_dims);
+
+    } else {
+      ret = xpu::broadcast<T>(dev_ctx.x_context(), in0->data<T>(),
+                              out0->data<T>(), vec_in_dims, vec_out_dims);
+    }
+    PADDLE_ENFORCE_EQ(
+        ret, XPU_SUCCESS,
+        platform::errors::External("XPU tile kernel return wrong value[%d %s]",
+                                   ret, XPUAPIErrorMsg[ret]));
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_XPU_KERNEL(tile, ops::TileXPUKernel<bool>, ops::TileXPUKernel<int>,
+                       ops::TileXPUKernel<int64_t>, ops::TileXPUKernel<float>);
+
+#endif
diff --git a/paddle/fluid/operators/triangular_solve_op.cc b/paddle/fluid/operators/triangular_solve_op.cc
new file mode 100644
index 0000000000000..4b01669bf55b4
--- /dev/null
+++ b/paddle/fluid/operators/triangular_solve_op.cc
@@ -0,0 +1,187 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/triangular_solve_op.h"
+#include "paddle/fluid/operators/solve_op.h"
+
+namespace paddle {
+namespace operators {
+
+class TriangularSolveOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "TriangularSolve");
+    OP_INOUT_CHECK(ctx->HasInput("Y"), "Input", "Y", "TriangularSolve");
+    OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "TriangularSolve");
+
+    auto x_dims = ctx->GetInputDim("X");
+    auto y_dims = ctx->GetInputDim("Y");
+
+    auto x_dims_n = x_dims.size();
+    auto y_dims_n = y_dims.size();
+
+    PADDLE_ENFORCE_GE(
+        x_dims_n, 2, platform::errors::InvalidArgument(
+                         "The input tensor X's dimensions of TriangularSolveOp "
+                         "should be >= 2. But received X's "
+                         "dimensions = %d, X's shape = [%s]",
+                         x_dims.size(), x_dims));
+
+    PADDLE_ENFORCE_GE(
+        y_dims_n, 2, platform::errors::InvalidArgument(
+                         "The input tensor Y's dimensions of TriangularSolveOp "
+                         "should be >=2. But received Y's "
+                         "dimensions = %d, Y's shape = [%s]",
+                         y_dims.size(), y_dims));
+
+    PADDLE_ENFORCE_EQ(x_dims[x_dims_n - 2], x_dims[x_dims_n - 1],
+                      platform::errors::InvalidArgument(
+                          "The inner-most 2 dimensions of Input(X) all should "
+                          "be square matrices "
+                          "But received X's shape[-2] = %d and shape[-1] = %d.",
+                          x_dims[x_dims_n - 2], x_dims[x_dims_n - 1]));
+
+    std::vector<int64_t> x_dims_vec = paddle::framework::vectorize(x_dims);
+    std::vector<int64_t> y_dims_vec = paddle::framework::vectorize(y_dims);
+
+    std::vector<int64_t> x_dims_vec_cut(x_dims_vec.begin(),
+                                        x_dims_vec.end() - 2);
+    std::vector<int64_t> y_dims_vec_cut(y_dims_vec.begin(),
+                                        y_dims_vec.end() - 2);
+
+    std::vector<int64_t> expand_batch_portion =
+        get_broadcast_batch_portion(x_dims_vec_cut, y_dims_vec_cut);
+
+    std::vector<int64_t> y_broadcast_dims({expand_batch_portion});
+    y_broadcast_dims.insert(y_broadcast_dims.end(), {y_dims_vec[y_dims_n - 2],
+                                                     y_dims_vec[y_dims_n - 1]});
+
+    // dim of 'Out' is the same with 'Y' after broadcast
+    ctx->SetOutputDim("Out", framework::make_ddim(y_broadcast_dims));
+    ctx->ShareLoD("X", /*->*/ "Out");
+  }
+
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const {
+    return framework::OpKernelType(
+        OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace());
+  }
+};
+
+class TriangularSolveOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("X",
+             "(Tensor), The first input tensor of triangular solve op, which "
+             "is the triangular coefficient matrix.");
+    AddInput("Y",
+             "(Tensor), The second input tensor of triangular solve op, which "
+             "is multiple right-hand.");
+    AddOutput("Out", "(Tensor), The solution tensor of triangular solve op.");
+    AddAttr<bool>("upper",
+                  "whether to solve the upper-triangular or the "
+                  "lower-triangular system of equations")
+        .SetDefault(true);
+    AddAttr<bool>("transpose", "whether X should be transposed firstly.")
+        .SetDefault(false);
+    AddAttr<bool>("unitriangular", "whether X is unit triangular.")
+        .SetDefault(false);
+    AddComment(R"DOC(
+          Triangular Solve Operator.
+          This operator is used to computes the solution of equations with a triangular coefficient matrix.
+
+          The equation is:
+          $$Out = X^-1 * Y$$
+)DOC");
+  }
+};
+
+class TriangularSolveOpInferVarType
+    : public framework::PassInDtypeAndVarTypeToOutput {
+ protected:
+  std::unordered_map<std::string, std::string>& GetInputOutputWithSameType()
+      const override {
+    static std::unordered_map<std::string, std::string> m{{"X", /*->*/ "Out"}};
+    return m;
+  }
+};
+
+class TriangularSolveGradOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "triangular_solve");
+    OP_INOUT_CHECK(ctx->HasInput("Y"), "Input", "Y", "triangular_solve");
+    OP_INOUT_CHECK(ctx->HasInput("Out"), "Input", "Out", "triangular_solve");
+    OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Out")), "Input",
+                   "Out@GRAD", "triangular_solve");
+
+    auto x_dims = ctx->GetInputDim("X");
+    auto y_dims = ctx->GetInputDim("Y");
+
+    auto x_grad_name = framework::GradVarName("X");
+    auto y_grad_name = framework::GradVarName("Y");
+
+    if (ctx->HasOutput(x_grad_name)) {
+      ctx->SetOutputDim(x_grad_name, x_dims);
+    }
+    if (ctx->HasOutput(y_grad_name)) {
+      ctx->SetOutputDim(y_grad_name, y_dims);
+    }
+  }
+};
+
+template <typename T>
+class TriangularSolveOpGradMaker : public framework::SingleGradOpMaker<T> {
+ public:
+  using framework::SingleGradOpMaker<T>::SingleGradOpMaker;
+
+ protected:
+  void Apply(GradOpPtr<T> retv) const override {
+    retv->SetType("triangular_solve_grad");
+    retv->SetInput("X", this->Input("X"));
+    retv->SetInput("Y", this->Input("Y"));
+    retv->SetInput("Out", this->Output("Out"));
+    retv->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out"));
+
+    retv->SetOutput(framework::GradVarName("X"), this->InputGrad("X"));
+    retv->SetOutput(framework::GradVarName("Y"), this->InputGrad("Y"));
+    retv->SetAttrMap(this->Attrs());
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(triangular_solve, ops::TriangularSolveOp,
+                  ops::TriangularSolveOpMaker,
+                  ops::TriangularSolveOpInferVarType,
+                  ops::TriangularSolveOpGradMaker<paddle::framework::OpDesc>,
+                  ops::TriangularSolveOpGradMaker<paddle::imperative::OpBase>);
+
+REGISTER_OPERATOR(triangular_solve_grad, ops::TriangularSolveGradOp);
+
+REGISTER_OP_CPU_KERNEL(
+    triangular_solve,
+    ops::TriangularSolveKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::TriangularSolveKernel<paddle::platform::CPUDeviceContext, double>);
+
+REGISTER_OP_CPU_KERNEL(
+    triangular_solve_grad,
+    ops::TriangularSolveGradKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::TriangularSolveGradKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/triangular_solve_op.cu b/paddle/fluid/operators/triangular_solve_op.cu
new file mode 100644
index 0000000000000..c5218aec03e28
--- /dev/null
+++ b/paddle/fluid/operators/triangular_solve_op.cu
@@ -0,0 +1,64 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/reduce_ops/reduce_functor_op.h"
+#include "paddle/fluid/operators/reduce_ops/reduce_op.h"
+#include "paddle/fluid/operators/triangular_solve_op.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename T>
+struct MatrixReduceSumFunctor<platform::CUDADeviceContext, T> {
+  void operator()(const Tensor& in, Tensor* out,
+                  const framework::ExecutionContext& ctx) {
+    // For example: in's dim = [5, 3, 2, 7, 3] ; out's dim = [3, 1, 7, 3]
+    // out_reduce_dim should be [0, 2]
+    const std::vector<std::int64_t> in_dims = framework::vectorize(in.dims());
+    auto in_size = in_dims.size();
+    const std::vector<std::int64_t> out_dims =
+        framework::vectorize(out->dims());
+    auto out_size = out_dims.size();
+
+    std::vector<std::int64_t> out_bst_dims(in_size);
+
+    std::fill(out_bst_dims.data(), out_bst_dims.data() + in_size - out_size, 1);
+    std::copy(out_dims.data(), out_dims.data() + out_size,
+              out_bst_dims.data() + in_size - out_size);
+
+    std::vector<int> out_reduce_dims;
+    for (size_t idx = 0; idx <= in_size - 3; idx++) {
+      if (in_dims[idx] != 1 && out_bst_dims[idx] == 1) {
+        out_reduce_dims.push_back(idx);
+      }
+    }
+    gpuStream_t stream = ctx.cuda_device_context().stream();
+    TensorReduceFunctorImpl<T, T, CustomSum>(in, out, out_reduce_dims, stream);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(
+    triangular_solve,
+    ops::TriangularSolveKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::TriangularSolveKernel<paddle::platform::CUDADeviceContext, double>);
+
+REGISTER_OP_CUDA_KERNEL(
+    triangular_solve_grad,
+    ops::TriangularSolveGradKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::TriangularSolveGradKernel<paddle::platform::CUDADeviceContext,
+                                   double>);
diff --git a/paddle/fluid/operators/triangular_solve_op.h b/paddle/fluid/operators/triangular_solve_op.h
new file mode 100644
index 0000000000000..f64b016366e39
--- /dev/null
+++ b/paddle/fluid/operators/triangular_solve_op.h
@@ -0,0 +1,227 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "glog/logging.h"
+#include "paddle/fluid/framework/ddim.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/framework/tensor_util.h"
+#include "paddle/fluid/operators/math/blas.h"
+#include "paddle/fluid/operators/math/complex_functors.h"
+#include "paddle/fluid/operators/reduce_ops/reduce_op.h"
+#include "paddle/fluid/operators/solve_op.h"
+#include "paddle/fluid/operators/tril_triu_op.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+
+template <typename DeviceContext, typename T>
+static void triangular_solve(const DeviceContext& context, const Tensor& x,
+                             const Tensor& y, Tensor* out, bool upper,
+                             bool transpose, bool unitriangular) {
+  // Tensor broadcast use eigen
+  std::vector<int64_t> x_bst_dims_vec;
+  std::vector<int64_t> y_bst_dims_vec;
+  std::tie(x_bst_dims_vec, y_bst_dims_vec) = get_broadcast_dims(x, y);
+
+  Tensor x_bst(x.type());
+  TensorExpand<T, DeviceContext>(context, x, &x_bst, x_bst_dims_vec);
+
+  Tensor y_bst(y.type());
+  TensorExpand<T, DeviceContext>(context, y, &y_bst, y_bst_dims_vec);
+
+  // TriangularSolveFunctor performs calculations in-place
+  // x_clone should be a copy of 'x' after broadcast
+  // out should be a copy of 'y' after broadcast
+  Tensor x_clone(x.type());
+  x_clone.Resize(framework::make_ddim(x_bst_dims_vec));
+  x_clone.mutable_data<T>(context.GetPlace());
+  framework::TensorCopy(x_bst, context.GetPlace(), context, &x_clone);
+
+  out->Resize(framework::make_ddim(y_bst_dims_vec));
+  out->mutable_data<T>(context.GetPlace());
+  framework::TensorCopy(y_bst, context.GetPlace(), context, out);
+
+  math::TriangularSolveFunctor<DeviceContext, T> functor;
+  functor(context, &x_clone, out, /*left=*/true, upper, transpose,
+          unitriangular);
+}
+
+template <typename DeviceContext, typename T>
+class MatrixReduceSumFunctor {
+ public:
+  void operator()(const Tensor& input, Tensor* output,
+                  const framework::ExecutionContext& ctx);
+};
+
+template <typename T>
+class MatrixReduceSumFunctor<platform::CPUDeviceContext, T> {
+ public:
+  void operator()(const Tensor& in, Tensor* out,
+                  const framework::ExecutionContext& ctx) {
+    // For example: in's dim = [5, 3, 2, 7, 3] ; out's dim = [3, 1, 7, 3]
+    // out_reduce_dim should be [0, 2]
+    const std::vector<std::int64_t> in_dims = framework::vectorize(in.dims());
+    auto in_size = in_dims.size();
+    const std::vector<std::int64_t> out_dims =
+        framework::vectorize(out->dims());
+    auto out_size = out_dims.size();
+
+    std::vector<std::int64_t> out_bst_dims(in_size);
+
+    std::fill(out_bst_dims.data(), out_bst_dims.data() + in_size - out_size, 1);
+    std::copy(out_dims.data(), out_dims.data() + out_size,
+              out_bst_dims.data() + in_size - out_size);
+    out->Resize(framework::make_ddim(out_bst_dims));
+
+    std::vector<int> out_reduce_dims;
+    for (size_t idx = 0; idx <= in_size - 3; idx++) {
+      if (in_dims[idx] != 1 && out_bst_dims[idx] == 1) {
+        out_reduce_dims.push_back(idx);
+      }
+    }
+
+    ReduceKernelFunctor<platform::CPUDeviceContext, T, SumFunctor>(
+        &in, out, out_reduce_dims, true, false, ctx)
+        .template apply<T>();
+    out->Resize(framework::make_ddim(out_dims));
+  }
+};
+
+template <typename DeviceContext, typename T>
+class TriangularSolveKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    const auto* x = ctx.Input<framework::Tensor>("X");
+    const auto* y = ctx.Input<framework::Tensor>("Y");
+    auto* out = ctx.Output<framework::Tensor>("Out");
+
+    bool upper = ctx.template Attr<bool>("upper");
+    bool transpose = ctx.template Attr<bool>("transpose");
+    bool unitriangular = ctx.template Attr<bool>("unitriangular");
+
+    const auto& dev_ctx = ctx.template device_context<DeviceContext>();
+    triangular_solve<DeviceContext, T>(dev_ctx, *x, *y, out, upper, transpose,
+                                       unitriangular);
+  }
+};
+
+template <typename DeviceContext, typename T>
+class TriangularSolveGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    const auto* x = ctx.Input<framework::Tensor>("X");
+    const auto* y = ctx.Input<framework::Tensor>("Y");
+    const auto* out = ctx.Input<framework::Tensor>("Out");
+    const auto* dout =
+        ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
+
+    auto* dx = ctx.Output<framework::Tensor>(framework::GradVarName("X"));
+    auto* dy = ctx.Output<framework::Tensor>(framework::GradVarName("Y"));
+
+    bool upper = ctx.template Attr<bool>("upper");
+    bool transpose = ctx.template Attr<bool>("transpose");
+    bool unitriangular = ctx.template Attr<bool>("unitriangular");
+
+    auto& dev_ctx = ctx.template device_context<DeviceContext>();
+
+    std::vector<int64_t> x_bst_dims_vec;
+    std::vector<int64_t> y_bst_dims_vec;
+    std::tie(x_bst_dims_vec, y_bst_dims_vec) = get_broadcast_dims(*x, *y);
+
+    Tensor dy_bst(y->type());
+    if (dy) {
+      dy->mutable_data<T>(y->dims(), dev_ctx.GetPlace());
+      dy_bst.Resize(framework::make_ddim(y_bst_dims_vec));
+      dy_bst.mutable_data<T>(dev_ctx.GetPlace());
+
+      // calculate x's conjugate for complex
+      Tensor x_conj(x->type());
+      platform::ForRange<DeviceContext> x_for_range(dev_ctx, x->numel());
+      math::ConjFunctor<T> x_functor(
+          x->data<T>(), x->numel(),
+          x_conj.mutable_data<T>(x->dims(), dev_ctx.GetPlace()));
+      x_for_range(x_functor);
+
+      // reuse forward to get dy_bst, and the result has been broadcated.
+      triangular_solve<DeviceContext, T>(dev_ctx, x_conj, *dout, &dy_bst, upper,
+                                         !transpose, unitriangular);
+
+      if (dy_bst.dims() == dy->dims()) {
+        framework::TensorCopy(dy_bst, dev_ctx.GetPlace(), dev_ctx, dy);
+      } else {
+        MatrixReduceSumFunctor<DeviceContext, T> functor;
+        functor(dy_bst, dy, ctx);
+        dy->Resize(y->dims());
+      }
+    }
+
+    Tensor dx_bst(x->type());
+    if (dx) {
+      dx->mutable_data<T>(x->dims(), dev_ctx.GetPlace());
+      dx_bst.Resize(framework::make_ddim(x_bst_dims_vec));
+      dx_bst.mutable_data<T>(dev_ctx.GetPlace());
+
+      // calculate out's conjugate for complex
+      Tensor out_conj(out->type());
+      platform::ForRange<DeviceContext> out_for_range(dev_ctx, out->numel());
+      math::ConjFunctor<T> out_functor(
+          out->data<T>(), out->numel(),
+          out_conj.mutable_data<T>(out->dims(), dev_ctx.GetPlace()));
+      out_for_range(out_functor);
+
+      auto blas = math::GetBlas<DeviceContext, T>(ctx);
+      if (transpose) {
+        auto mat_dim_a =
+            math::CreateMatrixDescriptor(out_conj.dims(), 0, false);
+        auto mat_dim_b = math::CreateMatrixDescriptor(dy_bst.dims(), 0, true);
+        blas.MatMul(out_conj, mat_dim_a, dy_bst, mat_dim_b, static_cast<T>(-1),
+                    &dx_bst, static_cast<T>(0));
+      } else {
+        auto mat_dim_a = math::CreateMatrixDescriptor(dy_bst.dims(), 0, false);
+        auto mat_dim_b = math::CreateMatrixDescriptor(out_conj.dims(), 0, true);
+        blas.MatMul(dy_bst, mat_dim_a, out_conj, mat_dim_b, static_cast<T>(-1),
+                    &dx_bst, static_cast<T>(0));
+      }
+
+      Tensor dx_bst_upper(x->type());
+      // get upper or lower triangular
+      dx_bst_upper.Resize(dx_bst.dims());
+      dx_bst_upper.mutable_data<T>(dev_ctx.GetPlace());
+
+      const auto& dims = dx_bst.dims();
+      const auto H = dims[dims.size() - 2];
+      const auto W = dims[dims.size() - 1];
+      platform::ForRange<DeviceContext> x_for_range(dev_ctx, dx_bst.numel());
+      TrilTriuCompute<T> tril_triu_computer(dx_bst.data<T>(), unitriangular,
+                                            !upper, H, W,
+                                            dx_bst_upper.data<T>());
+      x_for_range(tril_triu_computer);
+
+      if (dx_bst_upper.dims() == dx->dims()) {
+        framework::TensorCopy(dx_bst_upper, dev_ctx.GetPlace(), dev_ctx, dx);
+      } else {
+        MatrixReduceSumFunctor<DeviceContext, T> functor;
+        functor(dx_bst_upper, dx, ctx);
+        dx->Resize(x->dims());
+      }
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/unity_build_rule.cmake b/paddle/fluid/operators/unity_build_rule.cmake
index 8262273b7ca7d..5faa0dba6b878 100644
--- a/paddle/fluid/operators/unity_build_rule.cmake
+++ b/paddle/fluid/operators/unity_build_rule.cmake
@@ -109,7 +109,6 @@ register_unity_group(cc
     gaussian_random_batch_size_like_op.cc
     gaussian_random_op.cc
     mkldnn/gaussian_random_mkldnn_op.cc
-    grid_sampler_op.cc
     group_norm_op.cc gru_op.cc)
 register_unity_group(cc
     hash_op.cc
diff --git a/paddle/fluid/operators/warpctc_op.cc b/paddle/fluid/operators/warpctc_op.cc
index 92862929159d4..f38f5d9f72357 100644
--- a/paddle/fluid/operators/warpctc_op.cc
+++ b/paddle/fluid/operators/warpctc_op.cc
@@ -125,17 +125,6 @@ class WarpCTCOpMaker : public framework::OpProtoAndCheckerMaker {
                   "normalize the gradients by the number of time-step, "
                   "which is also the sequence's length.")
         .SetDefault(false);
-    AddAttr<bool>(
-        "norm_by_batchsize",
-        "(bool, default: false), normalize the loss by the batch size."
-        "If True, supersedes norm_by_times")
-        .SetDefault(false);
-    AddAttr<bool>(
-        "norm_by_total_logits_len",
-        "(bool, default: false), normalize the loss by the total number of "
-        "frames"
-        "in the batch. If True, supersedes norm_by_batchsize and norm_by_times")
-        .SetDefault(false);
     AddComment(R"DOC(
 An operator integrating the open-source
 [warp-ctc](https://github.com/baidu-research/warp-ctc) library, which is used in
@@ -217,21 +206,3 @@ REGISTER_OP_CPU_KERNEL(
     warpctc_grad,
     ops::WarpCTCGradKernel<paddle::platform::CPUDeviceContext, float>,
     ops::WarpCTCGradKernel<paddle::platform::CPUDeviceContext, double>);
-
-REGISTER_OP_VERSION(warpctc)
-    .AddCheckpoint(
-        R"ROC(
-              Upgrade warpctc add a new attribute [norm_by_batchsize] and [norm_by_total_logits_len])ROC",
-        paddle::framework::compatible::OpVersionDesc()
-            .NewAttr(
-                "norm_by_batchsize",
-                "(bool, default: false), normalize the loss by the batch size."
-                "If True, supersedes norm_by_times",
-                false)
-            .NewAttr("norm_by_total_logits_len",
-                     "(bool, default: false), normalize the loss by the total "
-                     "number of "
-                     "frames"
-                     "in the batch. If True, supersedes norm_by_batchsize and "
-                     "norm_by_times",
-                     false));
\ No newline at end of file
diff --git a/paddle/fluid/operators/warpctc_op.cu b/paddle/fluid/operators/warpctc_op.cu
index 27c17eb6de8ab..fd820805e4d08 100644
--- a/paddle/fluid/operators/warpctc_op.cu
+++ b/paddle/fluid/operators/warpctc_op.cu
@@ -12,185 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include <typeinfo>
-#include "paddle/fluid/framework/tensor_util.h"
 #include "paddle/fluid/operators/warpctc_op.h"
-#include "paddle/fluid/platform/cuda_primitives.h"
-#include "paddle/fluid/platform/gpu_info.h"
-
-namespace paddle {
-namespace operators {
-
-using platform::PADDLE_CUDA_NUM_THREADS;
-
-template <typename T>
-void PrintTensor(const framework::LoDTensor& src,
-                 const framework::ExecutionContext& ctx) {
-  std::vector<T> vec(src.numel());
-  TensorToVector(src, ctx.device_context(), &vec);
-  for (int i = 0; i < static_cast<int>(vec.size()); ++i) {
-    VLOG(3) << "vec[" << i << "] : " << vec[i];
-  }
-}
-
-template <typename T>
-__global__ void ReduceSumKernel(const T* d_in, T* d_out) {
-  // Allocate shared memory
-  extern __shared__ int partial_sum[];
-
-  // Calculate thread ID
-  int tid = blockIdx.x * blockDim.x + threadIdx.x;
-
-  // Load elements into shared memory
-  partial_sum[threadIdx.x] = d_in[tid];
-  __syncthreads();
-
-  // Start at 1/2 block stride and divide by two each iteration
-  for (int s = blockDim.x / 2; s > 0; s >>= 1) {
-    // Each thread does work unless it is further than the stride
-    if (threadIdx.x < s) {
-      partial_sum[threadIdx.x] += partial_sum[threadIdx.x + s];
-    }
-    __syncthreads();
-  }
-
-  // Let the thread 0 for this block write it's result to main memory
-  // Result is inexed by this block
-  if (threadIdx.x == 0) {
-    d_out[blockIdx.x] = partial_sum[0];
-  }
-}
-
-template <typename T>
-__global__ void CTCGradScaleKernel(T* d_out, const T* d_ctc, const T* d_loss,
-                                   int scale, int Tmax, int B, int D) {
-  int tid = blockIdx.x * blockDim.x + threadIdx.x;
-  int n_elems = Tmax * B * D;
-  int b_idx = (tid / D) % B;
-  for (; tid < n_elems; tid += gridDim.x * blockDim.x) {
-    d_out[tid] = d_ctc[tid] * d_loss[b_idx] / static_cast<T>(scale);
-  }
-}
-
-template <typename T>
-__global__ void CTCGradScaleKernel(T* d_out, const T* d_ctc, const T* d_loss,
-                                   int64_t* scale, int Tmax, int B, int D) {
-  int tid = blockIdx.x * blockDim.x + threadIdx.x;
-  int n_elems = Tmax * B * D;
-  int b_idx = (tid / D) % B;
-  for (; tid < n_elems; tid += gridDim.x * blockDim.x) {
-    d_out[tid] = d_ctc[tid] * d_loss[b_idx] / static_cast<T>(scale[0]);
-  }
-}
-
-template <typename T>
-__global__ void CTCGradBatchScaleKernel(T* d_out, const T* d_ctc,
-                                        const T* d_loss, const int64_t* scales,
-                                        int Tmax, int B, int D) {
-  int tid = blockIdx.x * blockDim.x + threadIdx.x;
-  int n_elems = Tmax * B * D;
-  int b_idx = (tid / D) % B;
-  // scale is vector, (B)
-  for (; tid < n_elems; tid += gridDim.x * blockDim.x) {
-    d_out[tid] = d_ctc[tid] * d_loss[b_idx] / scales[b_idx];
-  }
-}
-
-template <typename DeviceContext, typename T>
-class WarpCTCGradCUDAKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* warpctc_grad = ctx.Input<LoDTensor>("WarpCTCGrad");
-    auto* logits_grad = ctx.Output<LoDTensor>(framework::GradVarName("Logits"));
-    const Tensor* loss_grad = ctx.Input<Tensor>(framework::GradVarName("Loss"));
-
-    logits_grad->mutable_data<T>(ctx.GetPlace());
-    bool norm_by_times = ctx.Attr<bool>("norm_by_times");
-    bool norm_by_batchsize = ctx.Attr<bool>("norm_by_batchsize");
-    bool norm_by_total_logits_len = ctx.Attr<bool>("norm_by_total_logits_len");
-
-    if ((norm_by_times && norm_by_batchsize) ||
-        (norm_by_times && norm_by_total_logits_len) ||
-        (norm_by_batchsize && norm_by_total_logits_len)) {
-      PADDLE_THROW(platform::errors::InvalidArgument(
-          "[warpctc grad] norm_by_times, norm_by_batchsize and "
-          "norm_by_total_logits_len "
-          "should one be true."));
-    }
-
-    if (ctx.HasInput("LogitsLength")) {
-      auto& dev_ctx = ctx.template device_context<DeviceContext>();
-      auto stream = dev_ctx.stream();
-      int max_seq_length = warpctc_grad->dims()[0];  // Tmax
-      int num_sequences = warpctc_grad->dims()[1];   // B
-      int seq_width = warpctc_grad->dims()[2];       // D
-
-      auto* logits_length = ctx.Input<framework::Tensor>("LogitsLength");
-      const int64_t* logits_length_ptr = logits_length->data<int64_t>();
-
-      int n_elems = max_seq_length * num_sequences * seq_width;
-      int num_blocks =
-          (n_elems + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS;
-      int shm_bytes = PADDLE_CUDA_NUM_THREADS * sizeof(T);
-
-      auto logits_grad_ptr =
-          logits_grad->mutable_data<T>(ctx.GetPlace());  // (Tmax, B, D)
-      auto warpctc_grad_ptr = warpctc_grad->data<T>();   // (Tmax, B, D)
-      auto loss_grad_ptr = loss_grad->data<T>();         // (B, 1)
-
-      if (norm_by_total_logits_len) {
-        VLOG(3) << "norm_by_total_logits_len no impl ";
-        // total length
-        Tensor total_length;
-        int64_t* total_length_ptr =
-            total_length.mutable_data<int64_t>({1}, ctx.GetPlace());
-        int bytes = num_sequences * sizeof(int64_t);
-        ReduceSumKernel<int64_t><<<1, num_sequences, bytes, stream>>>(
-            logits_length_ptr, total_length_ptr);
-
-        CTCGradScaleKernel<
-            T><<<num_blocks, PADDLE_CUDA_NUM_THREADS, shm_bytes, stream>>>(
-            logits_grad_ptr, warpctc_grad_ptr, loss_grad_ptr, total_length_ptr,
-            max_seq_length, num_sequences, seq_width);
-
-      } else if (norm_by_batchsize) {
-        VLOG(3) << "norm_by_batchsize ";
-        CTCGradScaleKernel<
-            T><<<num_blocks, PADDLE_CUDA_NUM_THREADS, shm_bytes, stream>>>(
-            logits_grad_ptr, warpctc_grad_ptr, loss_grad_ptr, num_sequences,
-            max_seq_length, num_sequences, seq_width);
-      } else if (norm_by_times) {
-        VLOG(3) << "norm_by_times ";
-        CTCGradBatchScaleKernel<
-            T><<<num_blocks, PADDLE_CUDA_NUM_THREADS, shm_bytes, stream>>>(
-            logits_grad_ptr, warpctc_grad_ptr, loss_grad_ptr, logits_length_ptr,
-            max_seq_length, num_sequences, seq_width);
-      } else {
-        VLOG(3) << "default ";
-        CTCGradScaleKernel<
-            T><<<num_blocks, PADDLE_CUDA_NUM_THREADS, shm_bytes, stream>>>(
-            logits_grad_ptr, warpctc_grad_ptr, loss_grad_ptr, 1, max_seq_length,
-            num_sequences, seq_width);
-      }
-    } else {
-      math::UnpaddingLoDTensorFunctor<DeviceContext, T>()(
-          ctx.template device_context<DeviceContext>(), *warpctc_grad,
-          logits_grad, -1, 0, norm_by_times, norm_by_batchsize,
-          norm_by_total_logits_len, math::kLengthBatchWidth);
-
-      const T* loss_grad_data = loss_grad->data<T>();
-      math::ScaleLoDTensorFunctor<DeviceContext, T>()(
-          ctx.template device_context<DeviceContext>(), loss_grad_data,
-          logits_grad);
-    }
-  }
-};
-
-}  // operators
-}  // paddle
 
 namespace ops = paddle::operators;
-
 // register forward and backward of CUDA OP must in same *.cu file.
 // Eigen can be used on GPU device, but must be in *.cu file not *.cu.cc file.
 // *.cu.cc also using GCC compiler. *.cu using NVCC compiler
@@ -199,5 +23,5 @@ REGISTER_OP_CUDA_KERNEL(
     ops::WarpCTCKernel<paddle::platform::CUDADeviceContext, double>);
 REGISTER_OP_CUDA_KERNEL(
     warpctc_grad,
-    ops::WarpCTCGradCUDAKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::WarpCTCGradCUDAKernel<paddle::platform::CUDADeviceContext, double>);
+    ops::WarpCTCGradKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::WarpCTCGradKernel<paddle::platform::CUDADeviceContext, double>);
\ No newline at end of file
diff --git a/paddle/fluid/operators/warpctc_op.h b/paddle/fluid/operators/warpctc_op.h
index b515adc43fdfe..4cce33c3f520f 100644
--- a/paddle/fluid/operators/warpctc_op.h
+++ b/paddle/fluid/operators/warpctc_op.h
@@ -17,7 +17,6 @@ limitations under the License. */
 #include <vector>
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/op_version_registry.h"
 #include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/operators/math/sequence_padding.h"
 #include "paddle/fluid/operators/math/sequence_scale.h"
@@ -152,7 +151,7 @@ class WarpCTCFunctor {
     PADDLE_ENFORCE_EQ(
         CTC_STATUS_SUCCESS, status,
         platform::errors::PreconditionNotMet(
-            "warp-ctc [version %d] Error in ComputeCtcLossFunctor: %s",
+            "warp-ctc [version %d] Error in get_workspace_size: %s",
             warpctc_version_, platform::dynload::ctcGetStatusString(status)));
   }
 
@@ -315,8 +314,8 @@ class WarpCTCKernel : public framework::OpKernel<T> {
 
       math::PaddingLoDTensorFunctor<DeviceContext, T>()(
           ctx.template device_context<DeviceContext>(), *logits,
-          &warpctc_logits, pad_value, -1, 0, false /* norm_by_times */, false,
-          false, math::kLengthBatchWidth);
+          &warpctc_logits, pad_value, -1, 0, false /* norm_by_times */,
+          math::kLengthBatchWidth);
     }
     const T* warpctc_logits_data = warpctc_logits.data<T>();
 
@@ -351,7 +350,7 @@ class WarpCTCKernel : public framework::OpKernel<T> {
         math::UnpaddingLoDTensorFunctor<DeviceContext, int>()(
             ctx.template device_context<DeviceContext>(), *label,
             &warpctc_label, label->dims()[1] /*pad_seq_len*/, 0 /*lod_level*/,
-            false /*norm_by_times*/, false, false, math::kBatchLengthWidth);
+            false /*norm_by_times*/, math::kBatchLengthWidth);
       } else {
         LoDTensor gpu_label;
         gpu_label.mutable_data<int>(
@@ -361,7 +360,7 @@ class WarpCTCKernel : public framework::OpKernel<T> {
         math::UnpaddingLoDTensorFunctor<DeviceContext, int>()(
             ctx.template device_context<DeviceContext>(), *label, &gpu_label,
             label->dims()[1] /*pad_seq_len*/, 0 /*lod_level*/,
-            false /*norm_by_times*/, false, false, math::kBatchLengthWidth);
+            false /*norm_by_times*/, math::kBatchLengthWidth);
         TensorCopySync(gpu_label, platform::CPUPlace(), &warpctc_label);
       }
     } else {
@@ -390,23 +389,12 @@ template <typename DeviceContext, typename T>
 class WarpCTCGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
-    const Tensor* loss_grad = ctx.Input<Tensor>(framework::GradVarName("Loss"));
     auto* warpctc_grad = ctx.Input<LoDTensor>("WarpCTCGrad");
     auto* logits_grad = ctx.Output<LoDTensor>(framework::GradVarName("Logits"));
+    const Tensor* loss_grad = ctx.Input<Tensor>(framework::GradVarName("Loss"));
 
     logits_grad->mutable_data<T>(ctx.GetPlace());
     bool norm_by_times = ctx.Attr<bool>("norm_by_times");
-    bool norm_by_batchsize = ctx.Attr<bool>("norm_by_batchsize");
-    bool norm_by_total_logits_len = ctx.Attr<bool>("norm_by_total_logits_len");
-
-    if ((norm_by_times && norm_by_batchsize) ||
-        (norm_by_times && norm_by_total_logits_len) ||
-        (norm_by_batchsize && norm_by_total_logits_len)) {
-      PADDLE_THROW(platform::errors::InvalidArgument(
-          "[warpctc grad] norm_by_times, norm_by_batchsize and "
-          "norm_by_total_logits_len "
-          "should one be true."));
-    }
 
     if (ctx.HasInput("LogitsLength")) {
       int max_seq_length = warpctc_grad->dims()[0];  // Tmax
@@ -430,20 +418,7 @@ class WarpCTCGradKernel : public framework::OpKernel<T> {
                       loss_grad_e.reshape(grad_shape).broadcast(bcast).eval();
 
       auto* place = ctx.template device_context<DeviceContext>().eigen_device();
-      if (norm_by_total_logits_len) {
-        // Compute the avg. log-probability per batch sample and frame.
-        // Rank is 0
-        auto inv_len = logits_len_e.sum().cast<T>().inverse().eval();
-        logits_grad_e.device(*place) =
-            logits_g *
-            inv_len.reshape(Eigen::DSizes<int, 3>{1, 1, 1})
-                .broadcast(Eigen::DSizes<int, 3>{max_seq_length, num_sequences,
-                                                 seq_width});
-      } else if (norm_by_batchsize) {
-        // Compute the avg. log-probability per batch sample.
-        T scale = 1.0 / static_cast<T>(num_sequences);
-        logits_grad_e.device(*place) = logits_g * scale;
-      } else if (norm_by_times) {
+      if (norm_by_times) {
         auto scales = logits_len_e.cast<T>()
                           .inverse()
                           .reshape(grad_shape)
@@ -456,8 +431,7 @@ class WarpCTCGradKernel : public framework::OpKernel<T> {
     } else {
       math::UnpaddingLoDTensorFunctor<DeviceContext, T>()(
           ctx.template device_context<DeviceContext>(), *warpctc_grad,
-          logits_grad, -1, 0, norm_by_times, norm_by_batchsize,
-          norm_by_total_logits_len, math::kLengthBatchWidth);
+          logits_grad, -1, 0, norm_by_times, math::kLengthBatchWidth);
 
       const T* loss_grad_data = loss_grad->data<T>();
       math::ScaleLoDTensorFunctor<DeviceContext, T>()(
diff --git a/paddle/fluid/operators/xpu_api_wrapper.h b/paddle/fluid/operators/xpu_api_wrapper.h
new file mode 100644
index 0000000000000..4fdb33ca6c408
--- /dev/null
+++ b/paddle/fluid/operators/xpu_api_wrapper.h
@@ -0,0 +1,53 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+
+#ifdef PADDLE_WITH_XPU
+#include <vector>
+
+namespace paddle {
+namespace operators {
+
+template <typename XPUType, typename FCT>
+int xpu_fc_wrapper(xpu::Context* ctx, const XPUType* x, const XPUType* w,
+                   XPUType* y, int m, int n, int k, bool x_trans, bool w_trans,
+                   const float* x_maxptr, const float* w_maxptr,
+                   float* y_maxptr, int ldx, int ldw, int ldy, float alpha,
+                   float beta, const float* bias,
+                   const xpu::Activation_t& act) {
+  int r = 0;
+  if (x_trans && std::getenv("XPU_PADDLE_FC_TRANS_A") != nullptr &&
+      std::is_same<float, XPUType>::value) {
+    XPUType* l3_addr = nullptr;
+    xpu::ctx_guard RAII_GUARD(ctx);
+    l3_addr = RAII_GUARD.alloc_l3_or_gm<XPUType>(m * k);
+    if (l3_addr == nullptr) return XPUERR_NOMEM;
+
+    std::vector<int> shape = {k, m};
+    std::vector<int> axis = {1, 0};
+    r = xpu::transpose<XPUType>(ctx, x, l3_addr, shape, axis);
+    if (r != XPU_SUCCESS) return r;
+
+    r = xpu::fc_fusion<XPUType, XPUType, XPUType, FCT>(
+        ctx, l3_addr, w, y, m, n, k, false, w_trans, x_maxptr, w_maxptr,
+        y_maxptr, k, ldw, ldy, alpha, beta, bias, act);
+    if (r != XPU_SUCCESS) return r;
+  } else {
+    r = xpu::fc_fusion<XPUType, XPUType, XPUType, FCT>(
+        ctx, x, w, y, m, n, k, x_trans, w_trans, x_maxptr, w_maxptr, y_maxptr,
+        ldx, ldw, ldy, alpha, beta, bias, act);
+  }
+  return r;
+}
+
+}  // namespace operators
+}  // namespace paddle
+#endif
diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt
index 21213f9e6ff21..54e73c5c1d9fa 100644
--- a/paddle/fluid/platform/CMakeLists.txt
+++ b/paddle/fluid/platform/CMakeLists.txt
@@ -169,7 +169,7 @@ if(WITH_GPU)
   nv_test(device_event_test SRCS device_event_test.cc DEPS device_event_gpu)
 
   nv_test(device_context_test SRCS device_context_test.cu DEPS device_context gpu_info)
-  nv_test(cudnn_helper_test SRCS cudnn_helper_test.cc DEPS dynload_cuda)
+  nv_test(cudnn_helper_test SRCS cudnn_helper_test.cc DEPS dynload_cuda pten)
   nv_test(cudnn_desc_test SRCS cudnn_desc_test.cc DEPS dynload_cuda)
   nv_test(transform_test SRCS transform_test.cu DEPS memory place device_context)
 endif()
diff --git a/paddle/fluid/platform/complex.h b/paddle/fluid/platform/complex.h
index 065ccd375c94c..35de34086c57d 100644
--- a/paddle/fluid/platform/complex.h
+++ b/paddle/fluid/platform/complex.h
@@ -352,12 +352,12 @@ HOSTDEVICE inline bool operator>=(const complex<T>& a, const complex<T>& b) {
 }
 
 template <typename T>
-HOSTDEVICE inline complex<T> max(const complex<T>& a, const complex<T>& b) {
+HOSTDEVICE inline complex<T>(max)(const complex<T>& a, const complex<T>& b) {
   return (a.real >= b.real) ? a : b;
 }
 
 template <typename T>
-HOSTDEVICE inline complex<T> min(const complex<T>& a, const complex<T>& b) {
+HOSTDEVICE inline complex<T>(min)(const complex<T>& a, const complex<T>& b) {
   return (a.real < b.real) ? a : b;
 }
 
@@ -507,13 +507,13 @@ struct numeric_limits<paddle::platform::complex<T>> {
   static const bool traps = false;
   static const bool tinyness_before = false;
 
-  static paddle::platform::complex<T> min() {
+  static paddle::platform::complex<T>(min)() {
     return paddle::platform::complex<T>(0.0, 0.0);
   }
   static paddle::platform::complex<T> lowest() {
     return paddle::platform::complex<T>(0.0, 0.0);
   }
-  static paddle::platform::complex<T> max() {
+  static paddle::platform::complex<T>(max)() {
     return paddle::platform::complex<T>(0.0, 0.0);
   }
   static paddle::platform::complex<T> epsilon() {
diff --git a/paddle/fluid/platform/cuda_graph.cc b/paddle/fluid/platform/cuda_graph.cc
index 693a592799027..6f3d452ef5c50 100644
--- a/paddle/fluid/platform/cuda_graph.cc
+++ b/paddle/fluid/platform/cuda_graph.cc
@@ -22,14 +22,14 @@ std::unique_ptr<CUDAGraph> CUDAGraph::capturing_graph_{nullptr};
 void CUDAGraph::Reset() {
   if (is_reset_) return;
 #if CUDA_VERSION >= 10010
-  if (graph_) {
-    PADDLE_ENFORCE_CUDA_SUCCESS(cudaGraphDestroy(graph_));
-    graph_ = nullptr;
+  for (auto graph : graphs_) {
+    PADDLE_ENFORCE_CUDA_SUCCESS(cudaGraphDestroy(graph));
   }
-  if (exec_graph_) {
-    PADDLE_ENFORCE_CUDA_SUCCESS(cudaGraphExecDestroy(exec_graph_));
-    exec_graph_ = nullptr;
+  graphs_.clear();
+  for (auto exec_graph : exec_graphs_) {
+    PADDLE_ENFORCE_CUDA_SUCCESS(cudaGraphExecDestroy(exec_graph));
   }
+  exec_graphs_.clear();
 #endif
   // callback should be called in reverse order because the latter added
   // callback may rely on the former added callback.
@@ -45,16 +45,33 @@ void CUDAGraph::Replay() {
   PADDLE_ENFORCE_EQ(is_reset_, false,
                     errors::PermissionDenied(
                         "Cannot replay the CUDA Graph after reset is called."));
-  PADDLE_ENFORCE_NOT_NULL(exec_graph_,
-                          errors::PermissionDenied(
-                              "CUDA Graph must be captured before replaying."));
-  PADDLE_ENFORCE_CUDA_SUCCESS(cudaGraphLaunch(exec_graph_, stream_));
+  for (auto exec_graph : exec_graphs_) {
+    PADDLE_ENFORCE_CUDA_SUCCESS(cudaGraphLaunch(exec_graph, stream_));
+  }
+#endif
+}
+
+void CUDAGraph::BeginSegmentCapture() {
+  ThrowErrorIfNotSupportCUDAGraph();
+#if CUDA_VERSION >= 10010
+  PADDLE_ENFORCE_EQ(
+      IsCapturing(), true,
+      errors::PermissionDenied("BeginSegmentCapture should be called when CUDA "
+                               "Graph is capturing."));
+  PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamBeginCapture(
+      capturing_graph_->stream_, capturing_graph_->capture_mode_));
+  PADDLE_ENFORCE_EQ(IsValidCapturing(), true,
+                    platform::errors::PermissionDenied(
+                        "CUDA Graph should not be invalidated."));
+  VLOG(10) << "Begin to capture CUDA Graph with ID " << capturing_graph_->id_
+           << ", segment id " << capturing_graph_->graphs_.size();
 #endif
 }
 
 void CUDAGraph::BeginCapture(platform::CUDAPlace place, cudaStream_t stream,
                              cudaStreamCaptureMode mode) {
   ThrowErrorIfNotSupportCUDAGraph();
+#if CUDA_VERSION >= 10010
   PADDLE_ENFORCE_EQ(
       IsCapturing(), false,
       errors::PermissionDenied("CUDA Graph can only captured one by one."));
@@ -64,40 +81,87 @@ void CUDAGraph::BeginCapture(platform::CUDAPlace place, cudaStream_t stream,
   capturing_graph_.reset(new CUDAGraph());
   capturing_graph_->place_ = place;
   capturing_graph_->stream_ = stream;
-
-  PADDLE_ENFORCE_CUDA_SUCCESS(
-      cudaStreamBeginCapture(capturing_graph_->stream_, mode));
-  cudaStreamCaptureStatus status;
-  PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamGetCaptureInfo(
-      capturing_graph_->stream_, &status, &(capturing_graph_->id_)));
-  PADDLE_ENFORCE_EQ(IsValidCapturing(), true,
-                    platform::errors::PermissionDenied(
-                        "CUDA Graph should not be invalidated."));
-  VLOG(10) << "Begin to capture CUDA Graph with ID " << capturing_graph_->id_;
+  capturing_graph_->capture_mode_ = mode;
+  BeginSegmentCapture();
+#endif
 }
 
-std::unique_ptr<CUDAGraph> CUDAGraph::EndCapture() {
+void CUDAGraph::EndSegmentCapture() {
   ThrowErrorIfNotSupportCUDAGraph();
 #if CUDA_VERSION >= 10010
   PADDLE_ENFORCE_EQ(IsCapturing(), true,
                     errors::PermissionDenied("No CUDA Graph is capturing."));
-  PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamEndCapture(
-      capturing_graph_->stream_, &(capturing_graph_->graph_)));
+  cudaGraph_t graph;
   PADDLE_ENFORCE_CUDA_SUCCESS(
-      cudaGraphInstantiate(&(capturing_graph_->exec_graph_),
-                           capturing_graph_->graph_, nullptr, nullptr, 0));
-  VLOG(10) << "End to capture CUDA Graph with ID " << capturing_graph_->id_;
-  return std::move(capturing_graph_);
+      cudaStreamEndCapture(capturing_graph_->stream_, &graph));
+  auto num_nodes = static_cast<size_t>(-1);
+  PADDLE_ENFORCE_CUDA_SUCCESS(cudaGraphGetNodes(graph, nullptr, &num_nodes));
+  if (num_nodes == 0) {
+    PADDLE_ENFORCE_CUDA_SUCCESS(cudaGraphDestroy(graph));
+    VLOG(10) << "Skip empty CUDA Graph with ID " << capturing_graph_->id_
+             << ", segment id " << capturing_graph_->graphs_.size();
+    return;
+  }
+
+  cudaGraphExec_t exec_graph;
+  PADDLE_ENFORCE_CUDA_SUCCESS(
+      cudaGraphInstantiate(&exec_graph, graph, nullptr, nullptr, 0));
+  VLOG(10) << "End to capture CUDA Graph with ID " << capturing_graph_->id_
+           << ", segment id " << capturing_graph_->graphs_.size();
+  capturing_graph_->graphs_.emplace_back(graph);
+  capturing_graph_->exec_graphs_.emplace_back(exec_graph);
 #endif
 }
 
+std::unique_ptr<CUDAGraph> CUDAGraph::EndCapture() {
+  EndSegmentCapture();
+  return std::move(capturing_graph_);
+}
+
 bool CUDAGraph::IsValidCapturing() {
+#if CUDA_VERSION >= 10010
   if (!IsCapturing()) return false;
   cudaStreamCaptureStatus status;
   CUDAGraphID id;
   PADDLE_ENFORCE_CUDA_SUCCESS(
       cudaStreamGetCaptureInfo(capturing_graph_->stream_, &status, &id));
   return status == cudaStreamCaptureStatusActive;
+#else
+  return false;
+#endif
+}
+
+static std::string ConcatPath(const std::string &dirname,
+                              const std::string &filename) {
+#ifdef _WIN32
+  const char kFileSep[] = "\\";
+#else
+  const char kFileSep[] = "/";
+#endif
+  if (!dirname.empty() && dirname.back() == kFileSep[0]) {
+    return dirname + filename;
+  } else {
+    return dirname + kFileSep + filename;
+  }
+}
+
+void CUDAGraph::PrintToDotFiles(const std::string &dirname,
+                                unsigned int flags) {
+  ThrowErrorIfNotSupportCUDAGraph();
+#if CUDA_VERSION >= 11030
+  for (size_t i = 0; i < graphs_.size(); ++i) {
+    auto filename =
+        ConcatPath(dirname, "segment_" + std::to_string(i) + ".dot");
+    VLOG(10) << "Save the " << i << "-th segment of graph " << id_ << " to "
+             << filename;
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        cudaGraphDebugDotPrint(graphs_[i], filename.c_str(), flags));
+  }
+#else
+  PADDLE_THROW(platform::errors::Unimplemented(
+      "The print_to_dot_files() method is only supported when CUDA version >= "
+      "11.3."));
+#endif
 }
 
 }  // namespace platform
diff --git a/paddle/fluid/platform/cuda_graph.h b/paddle/fluid/platform/cuda_graph.h
index 55ec463556b45..f70a66f76242f 100644
--- a/paddle/fluid/platform/cuda_graph.h
+++ b/paddle/fluid/platform/cuda_graph.h
@@ -14,9 +14,11 @@
 
 #pragma once
 
+#include <atomic>
 #include <functional>
 #include <memory>
 #include <mutex>
+#include <vector>
 #include "cuda.h"          // NOLINT
 #include "cuda_runtime.h"  // NOLINT
 #include "paddle/fluid/platform/type_defs.h"
@@ -51,7 +53,10 @@ class CUDAGraph {
   // Since the constructor would throw error is CUDA_VERSION < 10010.
   // The non-static method of CUDAGraph need not check CUDA_VERSION
   // again.
-  CUDAGraph() { ThrowErrorIfNotSupportCUDAGraph(); }
+  CUDAGraph() {
+    ThrowErrorIfNotSupportCUDAGraph();
+    id_ = UniqueID();
+  }
 
  public:
   ~CUDAGraph() { Reset(); }
@@ -67,9 +72,15 @@ class CUDAGraph {
     callbacks_.push_back(std::move(callback));
   }
 
+  void PrintToDotFiles(const std::string &dirname, unsigned int flags);
+
   static void BeginCapture(platform::CUDAPlace place, cudaStream_t stream,
                            cudaStreamCaptureMode mode);
   static std::unique_ptr<CUDAGraph> EndCapture();
+
+  static void BeginSegmentCapture();
+  static void EndSegmentCapture();
+
   static void AddResetCallbackDuringCapturing(std::function<void()> callback) {
     capturing_graph_->AddResetCallback(std::move(callback));
   }
@@ -88,14 +99,21 @@ class CUDAGraph {
   // supported during capturing CUDA Graph.
   static bool IsValidCapturing();
 
+ private:
+  static CUDAGraphID UniqueID() {
+    static std::atomic<CUDAGraphID> id;
+    return id.fetch_add(1);
+  }
+
  private:
 #if CUDA_VERSION >= 10010
-  cudaGraph_t graph_{nullptr};
-  cudaGraphExec_t exec_graph_{nullptr};
+  std::vector<cudaGraph_t> graphs_;
+  std::vector<cudaGraphExec_t> exec_graphs_;
+  cudaStreamCaptureMode capture_mode_;
 #endif
   cudaStream_t stream_{nullptr};
   platform::CUDAPlace place_;
-  CUDAGraphID id_{0};
+  CUDAGraphID id_;
   std::vector<std::function<void()>> callbacks_;
   bool is_reset_{false};
   std::mutex mtx_;
diff --git a/paddle/fluid/platform/cuda_graph_with_memory_pool.h b/paddle/fluid/platform/cuda_graph_with_memory_pool.h
index f9f0248e5153b..6586146c5aefb 100644
--- a/paddle/fluid/platform/cuda_graph_with_memory_pool.h
+++ b/paddle/fluid/platform/cuda_graph_with_memory_pool.h
@@ -60,5 +60,30 @@ inline void AddResetCallbackIfCapturingCUDAGraph(Callback &&callback) {
   callback();
 }
 
+class SkipCUDAGraphCaptureGuard {
+  DISABLE_COPY_AND_ASSIGN(SkipCUDAGraphCaptureGuard);
+
+ public:
+  SkipCUDAGraphCaptureGuard() {
+#ifdef PADDLE_WITH_CUDA
+#if CUDA_VERSION >= 10010
+    if (UNLIKELY(CUDAGraph::IsCapturing())) {
+      CUDAGraph::EndSegmentCapture();
+    }
+#endif
+#endif
+  }
+
+  ~SkipCUDAGraphCaptureGuard() {
+#ifdef PADDLE_WITH_CUDA
+#if CUDA_VERSION >= 10010
+    if (UNLIKELY(CUDAGraph::IsCapturing())) {
+      CUDAGraph::BeginSegmentCapture();
+    }
+#endif
+#endif
+  }
+};
+
 }  // namespace platform
 }  // namespace paddle
diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc
index 587ad5f37e55e..cc3aab3ecdb7c 100644
--- a/paddle/fluid/platform/device_context.cc
+++ b/paddle/fluid/platform/device_context.cc
@@ -91,7 +91,7 @@ DeviceType Place2DeviceType(const platform::Place& place) {
 DeviceContextPool* DeviceContextPool::pool = nullptr;
 
 platform::DeviceContext* DeviceContextPool::Get(const platform::Place& place) {
-  VLOG(4) << "DeviceContextPool Get: " << place;
+  VLOG(6) << "DeviceContextPool Get: " << place;
   auto it = device_contexts_.find(place);
   if (it == device_contexts_.end()) {
     PADDLE_THROW(platform::errors::Unimplemented(
@@ -222,9 +222,13 @@ XPUDeviceContext::XPUDeviceContext(XPUPlace place) : place_(place) {
 
   context_ = xpu::create_context();
   const int MAX_XPU_NUM = 16;
-  const int l3_size = 13.5 * 1024 * 1024;
   static void* l3ptrs[MAX_XPU_NUM] = {nullptr};
 
+  int l3_size = 13.5 * 1024 * 1024;
+  if (std::getenv("XPU_PADDLE_L3_SIZE") != nullptr) {
+    l3_size = atoi(std::getenv("XPU_PADDLE_L3_SIZE"));
+  }
+
   auto selected_xpus = GetXPUSelectedDevices();
   for (unsigned int i = 0; i < selected_xpus.size(); i++) {
     if (place.device == selected_xpus[i]) {
diff --git a/paddle/fluid/platform/dynload/CMakeLists.txt b/paddle/fluid/platform/dynload/CMakeLists.txt
index 6e90ccfc51e1b..b396caf54a45a 100644
--- a/paddle/fluid/platform/dynload/CMakeLists.txt
+++ b/paddle/fluid/platform/dynload/CMakeLists.txt
@@ -11,8 +11,8 @@ if (WITH_ROCM)
 endif()
 
 # There is no macOS version of NCCL.
-# Disable nvrtc and cuda_driver api on MacOS and Windows, and only do a early test on Linux.
-if (NOT APPLE AND NOT WIN32)
+# Disable nvrtc and cuda_driver api on MacOS, and only do a early test on Linux and Windows.
+if (NOT APPLE)
     list(APPEND CUDA_SRCS nvrtc.cc cuda_driver.cc)
   if (WITH_NCCL)
     list(APPEND CUDA_SRCS nccl.cc)
diff --git a/paddle/fluid/platform/dynload/cublas.h b/paddle/fluid/platform/dynload/cublas.h
index ab30ab307a9c7..17ae4d5bf03d7 100644
--- a/paddle/fluid/platform/dynload/cublas.h
+++ b/paddle/fluid/platform/dynload/cublas.h
@@ -75,6 +75,8 @@ extern void *cublas_dso_handle;
   __macro(cublasDgeam);                   \
   __macro(cublasStrsm_v2);                \
   __macro(cublasDtrsm_v2);                \
+  __macro(cublasCtrsm_v2);                \
+  __macro(cublasZtrsm_v2);                \
   __macro(cublasCreate_v2);               \
   __macro(cublasDestroy_v2);              \
   __macro(cublasSetStream_v2);            \
@@ -84,6 +86,10 @@ extern void *cublas_dso_handle;
   __macro(cublasDgemmBatched);            \
   __macro(cublasCgemmBatched);            \
   __macro(cublasZgemmBatched);            \
+  __macro(cublasStrsmBatched);            \
+  __macro(cublasDtrsmBatched);            \
+  __macro(cublasCtrsmBatched);            \
+  __macro(cublasZtrsmBatched);            \
   __macro(cublasSgetrfBatched);           \
   __macro(cublasSgetriBatched);           \
   __macro(cublasDgetrfBatched);           \
diff --git a/paddle/fluid/platform/dynload/cuda_driver.cc b/paddle/fluid/platform/dynload/cuda_driver.cc
index 89a29bae7f337..6110e6b6ba93f 100644
--- a/paddle/fluid/platform/dynload/cuda_driver.cc
+++ b/paddle/fluid/platform/dynload/cuda_driver.cc
@@ -23,6 +23,9 @@ void* cuda_dso_handle = nullptr;
 
 #define DEFINE_WRAP(__name) DynLoad__##__name __name
 
+#if CUDA_VERSION >= 10020
+CUDA_ROUTINE_EACH_VVM(DEFINE_WRAP);
+#endif
 CUDA_ROUTINE_EACH(DEFINE_WRAP);
 
 bool HasCUDADriver() {
diff --git a/paddle/fluid/platform/dynload/cuda_driver.h b/paddle/fluid/platform/dynload/cuda_driver.h
index 5799b084f5f31..b5212c64cd14d 100644
--- a/paddle/fluid/platform/dynload/cuda_driver.h
+++ b/paddle/fluid/platform/dynload/cuda_driver.h
@@ -57,7 +57,23 @@ extern bool HasCUDADriver();
   __macro(cuCtxCreate);                                 \
   __macro(cuCtxGetCurrent);                             \
   __macro(cuDeviceGetCount);                            \
-  __macro(cuDevicePrimaryCtxGetState)
+  __macro(cuDevicePrimaryCtxGetState);                  \
+  __macro(cuDeviceGetAttribute);                        \
+  __macro(cuDeviceGet)
+
+#if CUDA_VERSION >= 10020
+#define CUDA_ROUTINE_EACH_VVM(__macro)    \
+  __macro(cuMemGetAllocationGranularity); \
+  __macro(cuMemAddressReserve);           \
+  __macro(cuMemCreate);                   \
+  __macro(cuMemMap);                      \
+  __macro(cuMemSetAccess);                \
+  __macro(cuMemUnmap);                    \
+  __macro(cuMemRelease);                  \
+  __macro(cuMemAddressFree)
+
+CUDA_ROUTINE_EACH_VVM(DECLARE_DYNAMIC_LOAD_CUDA_WRAP);
+#endif
 
 CUDA_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUDA_WRAP);
 
diff --git a/paddle/fluid/platform/dynload/cusparse.h b/paddle/fluid/platform/dynload/cusparse.h
index e5be003fadf06..e44e8ed08560f 100644
--- a/paddle/fluid/platform/dynload/cusparse.h
+++ b/paddle/fluid/platform/dynload/cusparse.h
@@ -56,8 +56,8 @@ extern void *cusparse_dso_handle;
 
 CUSPARSE_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP);
 
-// APIs available after CUDA 11.2
-#if CUDA_VERSION >= 11020
+// APIs available after CUDA 11.3
+#if CUDA_VERSION >= 11030
 #define CUSPARSE_ROUTINE_EACH_R2(__macro) \
   __macro(cusparseSDDMM_bufferSize);      \
   __macro(cusparseSDDMM_preprocess);      \
diff --git a/paddle/fluid/platform/dynload/dynamic_loader.cc b/paddle/fluid/platform/dynload/dynamic_loader.cc
index 1bfd48b133907..544c1c194d996 100644
--- a/paddle/fluid/platform/dynload/dynamic_loader.cc
+++ b/paddle/fluid/platform/dynload/dynamic_loader.cc
@@ -21,6 +21,10 @@ limitations under the License. */
 #include "paddle/fluid/platform/dynload/cupti_lib_path.h"
 #include "paddle/fluid/platform/enforce.h"
 
+#if defined(_WIN32)
+#include <windows.h>
+#endif
+
 DEFINE_string(cudnn_dir, "",
               "Specify path for loading libcudnn.so. For instance, "
               "/usr/local/cudnn/lib. If empty [default], dlopen "
@@ -414,6 +418,10 @@ void* GetCUDADsoHandle() {
   return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcuda.dylib", false);
 #elif defined(PADDLE_WITH_HIP)
   return GetDsoHandleFromSearchPath(FLAGS_rocm_dir, "libamdhip64.so", false);
+#elif defined(_WIN32)
+  char system32_dir[MAX_PATH];
+  GetSystemDirectory(system32_dir, MAX_PATH);
+  return GetDsoHandleFromSearchPath(system32_dir, "nvcuda.dll");
 #else
   return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcuda.so", false);
 #endif
diff --git a/paddle/fluid/platform/dynload/mklml.h b/paddle/fluid/platform/dynload/mklml.h
index 1120828916593..335b919f41c34 100644
--- a/paddle/fluid/platform/dynload/mklml.h
+++ b/paddle/fluid/platform/dynload/mklml.h
@@ -25,7 +25,7 @@ namespace platform {
 namespace dynload {
 
 extern std::once_flag mklml_dso_flag;
-extern void* mklml_dso_handle;
+extern void *mklml_dso_handle;
 
 /**
  * The following macro definition can generate structs
@@ -40,7 +40,7 @@ extern void* mklml_dso_handle;
       std::call_once(mklml_dso_flag, []() {                                \
         mklml_dso_handle = paddle::platform::dynload::GetMKLMLDsoHandle(); \
       });                                                                  \
-      static void* p_##_name = dlsym(mklml_dso_handle, #__name);           \
+      static void *p_##_name = dlsym(mklml_dso_handle, #__name);           \
       return reinterpret_cast<mklmlFunc>(p_##_name)(args...);              \
     }                                                                      \
   };                                                                       \
@@ -67,6 +67,8 @@ extern void* mklml_dso_handle;
   __macro(cblas_zgemv);             \
   __macro(cblas_strsm);             \
   __macro(cblas_dtrsm);             \
+  __macro(cblas_ctrsm);             \
+  __macro(cblas_ztrsm);             \
   __macro(cblas_sgemm_alloc);       \
   __macro(cblas_dgemm_alloc);       \
   __macro(cblas_sgemm_pack);        \
diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h
index caa495bb7f8c5..bdb901f583e26 100644
--- a/paddle/fluid/platform/enforce.h
+++ b/paddle/fluid/platform/enforce.h
@@ -188,11 +188,8 @@ struct TypeConverterImpl<T1, T2, false> {
 
 template <typename T1, typename T2>
 struct TypeConverter {
- private:
   static constexpr bool kIsArithmetic =
       IsArithmetic<T1>() && IsArithmetic<T2>();
-
- public:
   using Type1 = typename TypeConverterImpl<T1, T2, kIsArithmetic>::Type1;
   using Type2 = typename TypeConverterImpl<T1, T2, kIsArithmetic>::Type2;
 };
@@ -717,6 +714,7 @@ DEFINE_EXTERNAL_API_TYPE(cudnnStatus_t, CUDNN_STATUS_SUCCESS, CUDNN);
 DEFINE_EXTERNAL_API_TYPE(cublasStatus_t, CUBLAS_STATUS_SUCCESS, CUBLAS);
 DEFINE_EXTERNAL_API_TYPE(cusolverStatus_t, CUSOLVER_STATUS_SUCCESS, CUSOLVER);
 DEFINE_EXTERNAL_API_TYPE(cufftResult_t, CUFFT_SUCCESS, CUFFT);
+DEFINE_EXTERNAL_API_TYPE(CUresult, CUDA_SUCCESS, CU);
 
 #if !defined(__APPLE__) && defined(PADDLE_WITH_NCCL)
 DEFINE_EXTERNAL_API_TYPE(ncclResult_t, ncclSuccess, NCCL);
@@ -731,6 +729,7 @@ inline const char* GetErrorMsgUrl(T status) {
       details::ExternalApiType<__CUDA_STATUS_TYPE__>::kProtoType;
   switch (proto_type) {
     case platform::proto::ApiType::CUDA:
+    case platform::proto::ApiType::CU:
       return "https://docs.nvidia.com/cuda/cuda-runtime-api/"
              "group__CUDART__TYPES.html#group__CUDART__TYPES_"
              "1g3f51e3575c2178246db0a94a430e0038";
@@ -845,6 +844,7 @@ template std::string GetExternalErrorMsg<cudnnStatus_t>(cudnnStatus_t);
 template std::string GetExternalErrorMsg<cublasStatus_t>(cublasStatus_t);
 template std::string GetExternalErrorMsg<cusolverStatus_t>(cusolverStatus_t);
 template std::string GetExternalErrorMsg<cufftResult_t>(cufftResult_t);
+template std::string GetExternalErrorMsg<CUresult>(CUresult);
 #if !defined(__APPLE__) && defined(PADDLE_WITH_NCCL)
 template std::string GetExternalErrorMsg<ncclResult_t>(ncclResult_t);
 #endif
@@ -914,6 +914,15 @@ inline std::string build_nvidia_error_msg(cufftResult_t stat) {
   return sout.str();
 }
 
+/*************** CUresult ERROR ***************/
+inline bool is_error(CUresult stat) { return stat != CUDA_SUCCESS; }
+
+inline std::string build_nvidia_error_msg(CUresult stat) {
+  std::ostringstream sout;
+  sout << "CU error(" << stat << "). " << GetExternalErrorMsg(stat);
+  return sout.str();
+}
+
 /**************** NCCL ERROR ****************/
 #if !defined(__APPLE__) && defined(PADDLE_WITH_NCCL)
 inline bool is_error(ncclResult_t nccl_result) {
diff --git a/paddle/fluid/platform/external_error.proto b/paddle/fluid/platform/external_error.proto
index cbbf803492e64..fcbbb4162612d 100644
--- a/paddle/fluid/platform/external_error.proto
+++ b/paddle/fluid/platform/external_error.proto
@@ -25,6 +25,7 @@ enum ApiType {
   CUSOLVER = 4;
   NCCL = 5;
   CUFFT = 6;
+  CU = 7;
 }
 
 message MessageDesc {
diff --git a/paddle/fluid/platform/flags.cc b/paddle/fluid/platform/flags.cc
index ef908be8462ed..a674a6a8acdf2 100644
--- a/paddle/fluid/platform/flags.cc
+++ b/paddle/fluid/platform/flags.cc
@@ -681,6 +681,18 @@ PADDLE_DEFINE_EXPORTED_bool(
     apply_pass_to_program, false,
     "It controls whether to apply IR pass to program when using Fleet APIs");
 
+/**
+ * Pt kernel related FLAG
+ * Name: FLAGS_run_pten_kernel
+ * Since Version: 2.3.0
+ * Value Range: bool, default=false
+ * Example: FLAGS_run_pten_kernel=true would use the pt kernel to compute in the
+ * Op.
+ * Note:
+ */
+PADDLE_DEFINE_EXPORTED_bool(run_pten_kernel, true,
+                            "It controls whether to use pten kernel");
+
 /**
  * Distributed related FLAG
  * Name: FLAGS_allreduce_record_one_event
@@ -698,6 +710,7 @@ PADDLE_DEFINE_EXPORTED_bool(allreduce_record_one_event, false,
                             "events. Currently, only fuse allreduce supports "
                             "this. Otherwise, the precision may be wrong.");
 
+#ifdef PADDLE_WITH_CINN
 /*
  * CINN related FLAG
  * Name: FLAGS_use_cinn
@@ -705,9 +718,31 @@ PADDLE_DEFINE_EXPORTED_bool(allreduce_record_one_event, false,
  * Value Range: bool, default=false
  * Example: FLAGS_use_cinn=true would run PaddlePaddle using CINN
  */
-#ifdef PADDLE_WITH_CINN
 PADDLE_DEFINE_EXPORTED_bool(
     use_cinn, false, "It controls whether to run PaddlePaddle using CINN");
+
+/*
+ * CINN related FLAG
+ * Name: FLAGS_allow_cinn_ops
+ * Since Version: 2.3
+ * Value Range: string, default=""
+ * Example: FLAGS_allow_cinn_ops="mul;relu" would only cover `mul` and `relu`
+ * when using CINN
+ */
+PADDLE_DEFINE_EXPORTED_string(allow_cinn_ops, "",
+                              "It controls the cinn op subset to be used, "
+                              "which has the highest priority.");
+
+/*
+ * CINN related FLAG
+ * Name: FLAGS_deny_cinn_ops
+ * Since Version: 2.3
+ * Value Range: string, default=""
+ * Example: FLAGS_deny_cinn_ops="mul;relu" would block `mul` and `relu` two ops
+ * when using CINN
+ */
+PADDLE_DEFINE_EXPORTED_string(deny_cinn_ops, "",
+                              "It controls the cinn op subset to be not used.");
 #endif
 
 DEFINE_int32(record_pool_max_size, 2000000,
diff --git a/paddle/fluid/platform/gpu_info.cc b/paddle/fluid/platform/gpu_info.cc
index c624ba94b74a3..9dc6254234a97 100644
--- a/paddle/fluid/platform/gpu_info.cc
+++ b/paddle/fluid/platform/gpu_info.cc
@@ -26,6 +26,11 @@ limitations under the License. */
 #include "paddle/fluid/platform/dynload/cudnn.h"
 #endif
 #include "paddle/fluid/memory/malloc.h"
+#ifdef PADDLE_WITH_CUDA
+#if CUDA_VERSION >= 10020
+#include "paddle/fluid/platform/dynload/cuda_driver.h"
+#endif
+#endif
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/lock_guard_ptr.h"
 #include "paddle/fluid/platform/macros.h"
@@ -641,6 +646,30 @@ class RecordedCudaMallocHelper {
 
   uint64_t LimitSize() const { return limit_size_; }
 
+#ifdef PADDLE_WITH_CUDA
+#if CUDA_VERSION >= 10020
+  CUresult MemCreate(CUmemGenericAllocationHandle *handle, size_t size,
+                     const CUmemAllocationProp *prop,
+                     unsigned long long flags) {  // NOLINT
+    auto result =
+        paddle::platform::dynload::cuMemCreate(handle, size, prop, flags);
+    if (result == CUDA_SUCCESS) {
+      cur_size_.fetch_add(size);
+    }
+    return result;
+  }
+
+  CUresult MemRelease(CUmemGenericAllocationHandle handle, size_t size) {
+    auto result = paddle::platform::dynload::cuMemRelease(handle);
+    if (result == CUDA_SUCCESS) {
+      cur_size_.fetch_sub(size);
+    }
+    return result;
+  }
+
+#endif
+#endif
+
  private:
   const int dev_id_;
   const uint64_t limit_size_;
@@ -664,6 +693,22 @@ void RecordedCudaFree(void *p, size_t size, int dev_id) {
   return RecordedCudaMallocHelper::Instance(dev_id)->Free(p, size);
 }
 
+#ifdef PADDLE_WITH_CUDA
+#if CUDA_VERSION >= 10020
+CUresult RecordedCuMemCreate(CUmemGenericAllocationHandle *handle, size_t size,
+                             const CUmemAllocationProp *prop,
+                             unsigned long long flags, int dev_id) {  // NOLINT
+  return RecordedCudaMallocHelper::Instance(dev_id)->MemCreate(handle, size,
+                                                               prop, flags);
+}
+
+CUresult RecordedCuMemRelease(CUmemGenericAllocationHandle handle, size_t size,
+                              int dev_id) {
+  return RecordedCudaMallocHelper::Instance(dev_id)->MemRelease(handle, size);
+}
+#endif
+#endif
+
 bool RecordedCudaMemGetInfo(size_t *avail, size_t *total, size_t *actual_avail,
                             size_t *actual_total, int dev_id) {
   return RecordedCudaMallocHelper::Instance(dev_id)->GetMemInfo(
diff --git a/paddle/fluid/platform/gpu_info.h b/paddle/fluid/platform/gpu_info.h
index 401873dcd77da..93e787fcf36f5 100644
--- a/paddle/fluid/platform/gpu_info.h
+++ b/paddle/fluid/platform/gpu_info.h
@@ -131,6 +131,20 @@ gpuError_t RecordedCudaMalloc(void **ptr, size_t size, int dev_id);
 //! CudaFree with recorded info
 void RecordedCudaFree(void *p, size_t size, int dev_id);
 
+#ifdef PADDLE_WITH_CUDA
+#if CUDA_VERSION >= 10020
+
+//! cuMemCreate with recorded info
+CUresult RecordedCuMemCreate(CUmemGenericAllocationHandle *handle, size_t size,
+                             const CUmemAllocationProp *prop,
+                             unsigned long long flags, int dev_id);  // NOLINT
+
+//! cuMemRelease with recorded info
+CUresult RecordedCuMemRelease(CUmemGenericAllocationHandle handle, size_t size,
+                              int dev_id);
+#endif
+#endif
+
 //! Get available and total gpu memory with considering limitation
 bool RecordedCudaMemGetInfo(size_t *avail, size_t *total, size_t *actual_avail,
                             size_t *actual_total, int dev_id);
diff --git a/paddle/fluid/platform/init.cc b/paddle/fluid/platform/init.cc
index 290b3353ae54c..1109ecd52824a 100644
--- a/paddle/fluid/platform/init.cc
+++ b/paddle/fluid/platform/init.cc
@@ -37,6 +37,9 @@ limitations under the License. */
 #ifdef WITH_WIN_DUMP_DBG
 #include <stdio.h>
 #include <time.h>
+#ifndef NOMINMAX
+#define NOMINMAX  // msvc max/min macro conflict with std::min/max
+#endif
 #include <windows.h>
 
 #include "DbgHelp.h"
diff --git a/paddle/fluid/platform/mkldnn_helper.h b/paddle/fluid/platform/mkldnn_helper.h
index 37fa58e423db7..9236521fe1d95 100644
--- a/paddle/fluid/platform/mkldnn_helper.h
+++ b/paddle/fluid/platform/mkldnn_helper.h
@@ -333,6 +333,43 @@ inline mkldnn::memory::format_tag GetMKLDNNFormat(const mkldnn::memory memory) {
   return GetMKLDNNFormat(mem_desc);
 }
 
+inline mkldnn::memory::format_tag GetPlainMKLDNNFormat(int tensor_rank) {
+  switch (tensor_rank) {
+    case 1:
+      return mkldnn::memory::format_tag::a;
+      break;
+    case 2:
+      return mkldnn::memory::format_tag::ab;
+      break;
+    case 3:
+      return mkldnn::memory::format_tag::abc;
+      break;
+    case 4:
+      return mkldnn::memory::format_tag::abcd;
+      break;
+    case 5:
+      return mkldnn::memory::format_tag::abcde;
+      break;
+    case 6:
+      return mkldnn::memory::format_tag::abcdef;
+      break;
+    case 7:
+      return mkldnn::memory::format_tag::abcdefg;
+      break;
+    case 8:
+      return mkldnn::memory::format_tag::abcdefgh;
+      break;
+    case 9:
+      return mkldnn::memory::format_tag::abcdefghi;
+      break;
+    default:
+      PADDLE_THROW(platform::errors::Unimplemented(
+          "Paddle support tensors with rank in range <1, 9>, but received "
+          "tensor with rank: %d",
+          tensor_rank));
+  }
+}
+
 inline MKLDNNMemoryFormat MKLDNNFormatForSize(size_t dims_size,
                                               MKLDNNMemoryFormat data_format) {
   if (dims_size == 1) {
diff --git a/paddle/fluid/platform/mkldnn_reuse.h b/paddle/fluid/platform/mkldnn_reuse.h
index 084b47bb3c7a3..2bb08bcf81b6c 100644
--- a/paddle/fluid/platform/mkldnn_reuse.h
+++ b/paddle/fluid/platform/mkldnn_reuse.h
@@ -207,7 +207,7 @@ class MKLDNNHandlerNoCachingT {
   std::shared_ptr<mkldnn::memory> AcquireMemoryWithReorder(
       const mkldnn::memory::desc& user_md,
       const mkldnn::memory::desc& target_md, void* ptr,
-      const std::string& suffix, bool is_persistent = false,
+      bool is_persistent = false,
       std::function<std::shared_ptr<F>(const F*)> custom_reorder_func = {}) {
     std::shared_ptr<mkldnn::memory> target_memory_p;
     if (custom_reorder_func) {
@@ -500,18 +500,9 @@ class MKLDNNHandlerT {
   }
 
   void AcquireReorder(const std::shared_ptr<mkldnn::memory>& user_memory_p,
-                      const std::shared_ptr<mkldnn::memory>& target_memory_p,
-                      const std::string& suffix) {
-    const auto key_reorder_p = key_ + suffix + "reorder_p";
-
-    auto reorder_p = std::static_pointer_cast<mkldnn::reorder>(
-        dev_ctx_.GetBlob(key_reorder_p));
-
-    if (reorder_p == nullptr) {
-      reorder_p =
-          std::make_shared<mkldnn::reorder>(*user_memory_p, *target_memory_p);
-      dev_ctx_.SetBlob(key_reorder_p, reorder_p);
-    }
+                      const std::shared_ptr<mkldnn::memory>& target_memory_p) {
+    auto reorder_p =
+        std::make_shared<mkldnn::reorder>(*user_memory_p, *target_memory_p);
 
     auto& astream = platform::MKLDNNDeviceContext::tls().get_stream();
 
@@ -578,6 +569,8 @@ class MKLDNNHandlerT {
           std::static_pointer_cast<dnnl::memory>(dev_ctx_.GetBlob(user_key));
       user_memory_p->set_data_handle(ptr);
 
+      // TODO(jczaja): Here we detect if reorder is cached it means it is needed
+      // need to change this to get rid of keys
       auto reorder_p = std::static_pointer_cast<mkldnn::reorder>(
           dev_ctx_.GetBlob(key_reorder_p));
       if (reorder_p != nullptr) {
@@ -614,7 +607,8 @@ class BinaryMKLDNNHandler
   BinaryMKLDNNHandler(const dnnl::algorithm algo, const int axis,
                       const mkldnn::engine engine, platform::Place cpu_place,
                       const Tensor* x, const Tensor* y, Tensor* z,
-                      float scale_x, float scale_y, float scale_z)
+                      float scale_x, float scale_y, float scale_z,
+                      const dnnl::post_ops& post_ops = dnnl::post_ops())
       : platform::MKLDNNHandlerNoCachingT<T, dnnl::binary>(engine, cpu_place) {
     PADDLE_ENFORCE_EQ(
         x->layout(), DataLayout::kMKLDNN,
@@ -663,10 +657,11 @@ class BinaryMKLDNNHandler
                                      MKLDNNMemoryFormat::any);
 
     auto attributes = CreateAttributes(algo, scale_x, scale_y, scale_z);
+    attributes.set_post_ops(post_ops);
+
     this->AcquireForwardPrimitiveDescriptor(attributes, algo, src0_md, src1_md,
                                             dst_md);
   }
-
   std::shared_ptr<mkldnn::memory> AcquireSecondSrcMemory(
       const framework::Tensor* input) {
     const T* input_data = input->data<T>();
diff --git a/paddle/fluid/platform/port.h b/paddle/fluid/platform/port.h
index 453bea625b0ab..981e5f5af644e 100644
--- a/paddle/fluid/platform/port.h
+++ b/paddle/fluid/platform/port.h
@@ -30,13 +30,16 @@
 #include <sys/time.h>
 #include <algorithm>  // std::accumulate
 #else
+#ifndef NOMINMAX
 #define NOMINMAX  // msvc max/min macro conflict with std::min/max
+#endif
 // solve static linking error in windows
 // https://github.com/google/glog/issues/301
 #define GOOGLE_GLOG_DLL_DECL
 #include <io.h>  // _popen, _pclose
 #include <stdio.h>
 #include <windows.h>
+#include <winsock.h>
 #include <numeric>  // std::accumulate in msvc
 #ifndef S_ISDIR     // windows port for sys/stat.h
 #define S_ISDIR(mode) (((mode)&S_IFMT) == S_IFDIR)
diff --git a/paddle/fluid/platform/variant.h b/paddle/fluid/platform/variant.h
index 0f802c08842d0..fb4772abd3062 100644
--- a/paddle/fluid/platform/variant.h
+++ b/paddle/fluid/platform/variant.h
@@ -38,12 +38,13 @@ limitations under the License. */
 #endif
 #endif
 
-#include <boost/any.hpp>
 #include <boost/mpl/comparison.hpp>
 #include <boost/mpl/less_equal.hpp>
-#include <boost/optional.hpp>
 #include <boost/variant.hpp>
 
+#include "paddle/utils/any.h"
+#include "paddle/utils/optional.h"
+
 // some platform-independent defintion
 #if defined(_WIN32)
 #define UNUSED
diff --git a/paddle/fluid/platform/xpu/xpu2_op_list.h b/paddle/fluid/platform/xpu/xpu2_op_list.h
index 121d26e39dd8b..5eb86a36f5167 100644
--- a/paddle/fluid/platform/xpu/xpu2_op_list.h
+++ b/paddle/fluid/platform/xpu/xpu2_op_list.h
@@ -90,6 +90,12 @@ XPUOpMap& get_kl2_ops() {
                                   pOpKernelType(vartype::FP16, XPUPlace())})},
       {"adam", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
       {"adamw", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"reduce_sum", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"reduce_sum_grad",
+       XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"softmax", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"softmax_grad",
+       XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
       {"softmax_with_cross_entropy",
        XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
       {"softmax_with_cross_entropy_grad",
@@ -171,6 +177,39 @@ XPUOpMap& get_kl2_ops() {
                      pOpKernelType(vartype::INT32, XPUPlace()),
                      pOpKernelType(vartype::INT8, XPUPlace()),
                      pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"matmul_v2", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"matmul_v2_grad",
+       XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"matmul", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"matmul_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"relu", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"relu_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"assign_value",
+       XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"dropout", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"dropout_grad",
+       XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"elementwise_div",
+       XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"elementwise_div_grad",
+       XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"range", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()),
+                              pOpKernelType(vartype::INT64, XPUPlace())})},
+      {"reshape2", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"reshape2_grad",
+       XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"shape", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()),
+                              pOpKernelType(vartype::INT64, XPUPlace())})},
+      {"one_hot_v2", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()),
+                                   pOpKernelType(vartype::INT64, XPUPlace())})},
+      {"layer_norm", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"layer_norm_grad",
+       XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"lookup_table_v2",
+       XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"lookup_table_v2_grad",
+       XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"scale", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
 
       {"flatten_contiguous_range",
        XPUKernelSet({pOpKernelType(vartype::INT64, XPUPlace()),
@@ -184,6 +223,46 @@ XPUOpMap& get_kl2_ops() {
                      pOpKernelType(vartype::INT8, XPUPlace()),
                      pOpKernelType(vartype::FP16, XPUPlace()),
                      pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"scale", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()),
+                              pOpKernelType(vartype::FP16, XPUPlace()),
+                              pOpKernelType(vartype::INT64, XPUPlace())})},
+      {"tanh", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()),
+                             pOpKernelType(vartype::FP16, XPUPlace())})},
+      {"tanh_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()),
+                                  pOpKernelType(vartype::FP16, XPUPlace())})},
+      {"gelu", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()),
+                             pOpKernelType(vartype::FP16, XPUPlace())})},
+      {"gelu_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()),
+                                  pOpKernelType(vartype::FP16, XPUPlace())})},
+      {"gather", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()),
+                               pOpKernelType(vartype::FP16, XPUPlace())})},
+      {"gather_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()),
+                                    pOpKernelType(vartype::FP16, XPUPlace())})},
+      {"fill_constant",
+       XPUKernelSet({pOpKernelType(vartype::INT64, XPUPlace()),
+                     pOpKernelType(vartype::INT32, XPUPlace()),
+                     pOpKernelType(vartype::INT16, XPUPlace()),
+                     pOpKernelType(vartype::INT8, XPUPlace()),
+                     pOpKernelType(vartype::BOOL, XPUPlace()),
+                     pOpKernelType(vartype::FP64, XPUPlace()),
+                     pOpKernelType(vartype::FP32, XPUPlace()),
+                     pOpKernelType(vartype::FP16, XPUPlace()),
+                     pOpKernelType(vartype::BF16, XPUPlace()),
+                     pOpKernelType(vartype::COMPLEX64, XPUPlace()),
+                     pOpKernelType(vartype::COMPLEX128, XPUPlace())})},
+      {"softmax", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()),
+                                pOpKernelType(vartype::FP16, XPUPlace())})},
+      {"softmax_grad",
+       XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()),
+                     pOpKernelType(vartype::FP16, XPUPlace())})},
+      {"gather_nd", XPUKernelSet({pOpKernelType(vartype::INT32, XPUPlace()),
+                                  pOpKernelType(vartype::INT64, XPUPlace()),
+                                  pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"tile", XPUKernelSet({pOpKernelType(vartype::INT32, XPUPlace()),
+                             pOpKernelType(vartype::INT64, XPUPlace()),
+                             pOpKernelType(vartype::BOOL, XPUPlace()),
+                             pOpKernelType(vartype::FP32, XPUPlace())})}
+
       // AddMore
   };
 
diff --git a/paddle/fluid/platform/xpu/xpu_header.h b/paddle/fluid/platform/xpu/xpu_header.h
index caee41ae299c7..a72fbd65e2462 100644
--- a/paddle/fluid/platform/xpu/xpu_header.h
+++ b/paddle/fluid/platform/xpu/xpu_header.h
@@ -19,6 +19,7 @@
 #include <string>
 #include <unordered_map>
 
+#include "paddle/fluid/platform/bfloat16.h"
 #include "paddle/fluid/platform/errors.h"
 #include "paddle/fluid/platform/float16.h"
 #include "xpu/runtime.h"
@@ -68,4 +69,10 @@ class XPUTypeTrait<paddle::platform::float16> {
   using Type = float16;
 };
 
+template <>
+class XPUTypeTrait<paddle::platform::bfloat16> {
+ public:
+  using Type = bfloat16;
+};
+
 #endif
diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt
index 875e6af9652a2..595c833cbfa8a 100644
--- a/paddle/fluid/pybind/CMakeLists.txt
+++ b/paddle/fluid/pybind/CMakeLists.txt
@@ -7,7 +7,7 @@ set(PYBIND_DEPS pybind python proto_desc memory executor fleet_wrapper box_wrapp
   feed_fetch_method pass generate_pass pass_builder parallel_executor profiler layer tracer engine scope_pool
   analysis_predictor imperative_profiler imperative_flag save_load_util dlpack_tensor device_context
   gloo_wrapper infer_io_utils heter_wrapper generator op_version_registry ps_gpu_wrapper custom_operator
-  cost_model cuda_graph_with_memory_pool)
+  cost_model cuda_graph_with_memory_pool fleet_executor)
 
 if (WITH_PSCORE)
   set(PYBIND_DEPS ${PYBIND_DEPS} ps_service)
@@ -61,6 +61,7 @@ set(PYBIND_SRCS
   imperative.cc
   ir.cc
   bind_cost_model.cc
+  bind_fleet_executor.cc
   inference_api.cc
   compatible.cc
   io.cc
diff --git a/paddle/fluid/pybind/bind_fleet_executor.cc b/paddle/fluid/pybind/bind_fleet_executor.cc
new file mode 100644
index 0000000000000..392cdfe19bd7a
--- /dev/null
+++ b/paddle/fluid/pybind/bind_fleet_executor.cc
@@ -0,0 +1,34 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/pybind/bind_fleet_executor.h"
+#include <pybind11/stl.h>
+#include "paddle/fluid/distributed/fleet_executor/fleet_executor.h"
+#include "paddle/fluid/framework/program_desc.h"
+
+namespace py = pybind11;
+
+namespace paddle {
+namespace pybind {
+
+using paddle::distributed::FleetExecutor;
+
+void BindFleetExecutor(py::module* m) {
+  py::class_<FleetExecutor>(*m, "FleetExecutor")
+      .def(py::init<const std::string&>())
+      .def("init", &FleetExecutor::Init)
+      .def("run", &FleetExecutor::Run);
+}
+}  // namespace pybind
+}  // namespace paddle
diff --git a/paddle/fluid/pybind/bind_fleet_executor.h b/paddle/fluid/pybind/bind_fleet_executor.h
new file mode 100644
index 0000000000000..733701fa36ba8
--- /dev/null
+++ b/paddle/fluid/pybind/bind_fleet_executor.h
@@ -0,0 +1,25 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <pybind11/pybind11.h>
+
+namespace paddle {
+namespace pybind {
+
+void BindFleetExecutor(pybind11::module* m);
+
+}  // namespace pybind
+}  // namespace paddle
diff --git a/paddle/fluid/pybind/fleet_py.cc b/paddle/fluid/pybind/fleet_py.cc
index ea9faf57ac52b..e6b8238010a35 100644
--- a/paddle/fluid/pybind/fleet_py.cc
+++ b/paddle/fluid/pybind/fleet_py.cc
@@ -158,7 +158,8 @@ void BindDistCommunicator(py::module* m) {
       .def("start", &Communicator::Start)
       .def("push_sparse_param", &Communicator::RpcSendSparseParam)
       .def("is_running", &Communicator::IsRunning)
-      .def("init_params", &Communicator::InitParams);
+      .def("init_params", &Communicator::InitParams)
+      .def("pull_dense", &Communicator::PullDense);
   //  .def("recv", &Communicator::RecvNoBarrier);
 }
 
@@ -204,7 +205,8 @@ void BindGraphPyClient(py::module* m) {
       .def("add_table_feat_conf", &GraphPyClient::add_table_feat_conf)
       .def("pull_graph_list", &GraphPyClient::pull_graph_list)
       .def("start_client", &GraphPyClient::start_client)
-      .def("batch_sample_neighboors", &GraphPyClient::batch_sample_neighboors)
+      .def("batch_sample_neighboors", &GraphPyClient::batch_sample_neighbors)
+      .def("batch_sample_neighbors", &GraphPyClient::batch_sample_neighbors)
       .def("remove_graph_node", &GraphPyClient::remove_graph_node)
       .def("random_sample_nodes", &GraphPyClient::random_sample_nodes)
       .def("stop_server", &GraphPyClient::stop_server)
diff --git a/paddle/fluid/pybind/global_value_getter_setter.cc b/paddle/fluid/pybind/global_value_getter_setter.cc
index e7f1bef4bee62..276c528f2a6ea 100644
--- a/paddle/fluid/pybind/global_value_getter_setter.cc
+++ b/paddle/fluid/pybind/global_value_getter_setter.cc
@@ -145,6 +145,7 @@ class PYBIND11_HIDDEN GlobalVarGetterSetterRegistry {
   }
 
   void Set(const std::string &name, const py::object &value) const {
+    VLOG(4) << "set " << name << " to " << value;
     SetterMethod(name)(value);
   }
 
diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc
index 8b01f02ee2c3a..4403eb469723a 100644
--- a/paddle/fluid/pybind/imperative.cc
+++ b/paddle/fluid/pybind/imperative.cc
@@ -985,6 +985,12 @@ void BindImperative(py::module *m_ptr) {
                 auto value_tensor =
                     value_obj.cast<std::shared_ptr<imperative::VarBase>>();
                 ins.insert({"ValueTensor", {value_tensor}});
+
+                // pass the stop_gradient from value to tensor
+                if (!value_tensor->OverridedStopGradient() &&
+                    self->OverridedStopGradient()) {
+                  self->SetOverridedStopGradient(false);
+                }
               } else if (py::isinstance<py::array>(value_obj)) {
                 auto value_tensor = std::shared_ptr<imperative::VarBase>(
                     new imperative::VarBase(false,
diff --git a/paddle/fluid/pybind/op_function_generator.cc b/paddle/fluid/pybind/op_function_generator.cc
index 54ea0f2aee17f..850f208359e05 100644
--- a/paddle/fluid/pybind/op_function_generator.cc
+++ b/paddle/fluid/pybind/op_function_generator.cc
@@ -567,7 +567,9 @@ GenerateOpFunctions() {
     auto& op_type = op_proto->type();
     // Skip ooerator which is not inherit form OperatorWithKernel, like while,
     // since only OperatorWithKernel can run in dygraph mode.
-    if (!all_kernels.count(op_type)) {
+    // if the pten lib contains op kernel, we still generate ops method
+    if (!all_kernels.count(op_type) &&
+        !pten::KernelFactory::Instance().HasCompatiblePtenKernel(op_type)) {
       continue;
     }
 
diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index b27c05d98a1c0..a5b0b1cd2a061 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -81,6 +81,7 @@ limitations under the License. */
 #include "paddle/fluid/pybind/ascend_wrapper_py.h"
 #endif
 #include "paddle/fluid/pybind/bind_cost_model.h"
+#include "paddle/fluid/pybind/bind_fleet_executor.h"
 #include "paddle/fluid/pybind/box_helper_py.h"
 #include "paddle/fluid/pybind/compatible.h"
 #include "paddle/fluid/pybind/const_value.h"
@@ -226,6 +227,23 @@ bool SupportsBfloat16FastPerformance() {
 #endif
 }
 
+bool SupportsInt8() {
+#ifndef PADDLE_WITH_MKLDNN
+  return false;
+#else
+  return (platform::MayIUse(platform::cpu_isa_t::avx2) ||
+          platform::MayIUse(platform::cpu_isa_t::avx512f));
+#endif
+}
+
+bool SupportsVNNI() {
+#ifndef PADDLE_WITH_MKLDNN
+  return false;
+#else
+  return platform::MayIUse(platform::cpu_isa_t::avx512_core_vnni);
+#endif
+}
+
 // According to the input `place` and `dtype`, this function returns a tuple
 // consists of three sets:
 // 1) All operators registered in the Paddle framework.
@@ -488,6 +506,17 @@ static int GetNCCLVersion() {
 }
 #endif
 
+template <typename PlaceType>
+static void TensorCopyFrom(framework::Tensor *dst, const framework::Tensor &src,
+                           const PlaceType &place, int64_t batch_size) {
+  if (batch_size < 0) {
+    framework::TensorCopy(src, place, dst);
+  } else {
+    auto sliced = src.Slice(0, batch_size);
+    framework::TensorCopy(sliced, place, dst);
+  }
+}
+
 #ifdef PADDLE_WITH_AVX
 PYBIND11_MODULE(core_avx, m) {
 #else
@@ -517,8 +546,13 @@ PYBIND11_MODULE(core_noavx, m) {
 
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   m.def("cudnn_version", &platform::CudnnVersion);
+  m.def("gpu_memory_available", []() {
+    size_t available = 0;
+    size_t total = 0;
+    paddle::platform::GpuMemoryUsage(&available, &total);
+    return available;
+  });
 #endif
-
 #ifdef PADDLE_WITH_NCCL
   m.def("nccl_version", &GetNCCLVersion);
 #endif
@@ -533,7 +567,8 @@ PYBIND11_MODULE(core_noavx, m) {
                   })
       .def_static("end_capture", &platform::EndCUDAGraphCapture)
       .def("replay", &platform::CUDAGraph::Replay)
-      .def("reset", &platform::CUDAGraph::Reset);
+      .def("reset", &platform::CUDAGraph::Reset)
+      .def("print_to_dot_files", &platform::CUDAGraph::PrintToDotFiles);
 #endif
 
   m.def("wait_device", [](const platform::Place &place) {
@@ -737,16 +772,17 @@ PYBIND11_MODULE(core_noavx, m) {
               paddle::framework::proto::VarType::Type type) {
              return reinterpret_cast<uintptr_t>(self.mutable_data(place, type));
            })
-      .def("_copy_from",
-           [](framework::Tensor &self, const framework::Tensor &other,
-              const platform::Place &place, int64_t batch_size) {
-             if (batch_size < 0) {
-               framework::TensorCopy(other, place, &self);
-             } else {
-               auto sliced = other.Slice(0, batch_size);
-               framework::TensorCopy(sliced, place, &self);
-             }
-           },
+      .def("_copy_from", &TensorCopyFrom<paddle::platform::CPUPlace>,
+           py::arg("tensor"), py::arg("place"), py::arg("batch_size") = -1)
+      .def("_copy_from", &TensorCopyFrom<paddle::platform::XPUPlace>,
+           py::arg("tensor"), py::arg("place"), py::arg("batch_size") = -1)
+      .def("_copy_from", &TensorCopyFrom<paddle::platform::CUDAPlace>,
+           py::arg("tensor"), py::arg("place"), py::arg("batch_size") = -1)
+      .def("_copy_from", &TensorCopyFrom<paddle::platform::NPUPlace>,
+           py::arg("tensor"), py::arg("place"), py::arg("batch_size") = -1)
+      .def("_copy_from", &TensorCopyFrom<paddle::platform::CUDAPinnedPlace>,
+           py::arg("tensor"), py::arg("place"), py::arg("batch_size") = -1)
+      .def("_copy_from", &TensorCopyFrom<paddle::platform::Place>,
            py::arg("tensor"), py::arg("place"), py::arg("batch_size") = -1)
       .def("set", SetTensorFromPyArray<paddle::platform::CPUPlace>,
            py::arg("array"), py::arg("place"), py::arg("zero_copy") = false)
@@ -1709,6 +1745,14 @@ All parameter, weight, gradient are variables in Paddle.
   m.def("get_xpu_device_count", platform::GetXPUDeviceCount);
   m.def("get_xpu_device_version",
         [](int device_id) { return platform::get_xpu_version(device_id); });
+  m.def("is_float16_supported", [](const platform::XPUPlace &place) -> bool {
+    // XPUs with Compute Capability > xpu2 support float16 and bfloat16
+    return platform::get_xpu_version(place.device) > platform::XPUVersion::XPU1;
+  });
+  m.def("is_bfloat16_supported", [](const platform::XPUPlace &place) -> bool {
+    // XPUs with Compute Capability > xpu2 support float16 and bfloat16
+    return platform::get_xpu_version(place.device) > platform::XPUVersion::XPU1;
+  });
 #endif
 
   py::class_<paddle::platform::CPUPlace>(m, "CPUPlace", R"DOC(
@@ -2025,11 +2069,13 @@ All parameter, weight, gradient are variables in Paddle.
                  fetch_vars);
       });
 
-  py::class_<framework::CostInfo>(m, "CostInfo")
+  py::class_<framework::interpreter::CostInfo>(m, "CostInfo")
       .def(py::init<>())
-      .def("total_time", [](CostInfo &self) { return self.total_time; })
-      .def("device_memory_bytes",
-           [](CostInfo &self) { return self.device_memory_bytes; });
+      .def("total_time",
+           [](interpreter::CostInfo &self) { return self.total_time; })
+      .def("device_memory_bytes", [](interpreter::CostInfo &self) {
+        return self.device_memory_bytes;
+      });
 
   py::class_<framework::StandaloneExecutor>(m, "StandaloneExecutor")
       .def(py::init<const platform::Place &, const ProgramDesc &,
@@ -2038,7 +2084,7 @@ All parameter, weight, gradient are variables in Paddle.
            [](StandaloneExecutor &self,
               const std::unordered_map<std::string, py::array> &input_dict,
               std::vector<std::string> fetch_names) {
-             std::vector<framework::Tensor> feed_tensors;
+             std::vector<framework::LoDTensor> feed_tensors;
              std::vector<std::string> feed_names;
 
              for (auto &item : input_dict) {
@@ -2058,10 +2104,10 @@ All parameter, weight, gradient are variables in Paddle.
            })
       .def("run",
            [](StandaloneExecutor &self,
-              const std::unordered_map<std::string, framework::Tensor>
+              const std::unordered_map<std::string, framework::LoDTensor>
                   &input_dict,
               std::vector<std::string> fetch_names) {
-             std::vector<framework::Tensor> feed_tensors;
+             std::vector<framework::LoDTensor> feed_tensors;
              std::vector<std::string> feed_names;
 
              for (auto &item : input_dict) {
@@ -2079,7 +2125,7 @@ All parameter, weight, gradient are variables in Paddle.
       .def("dry_run",
            [](StandaloneExecutor &self,
               const std::unordered_map<std::string, py::array> &input_dict) {
-             std::vector<framework::Tensor> feed_tensors;
+             std::vector<framework::LoDTensor> feed_tensors;
              std::vector<std::string> feed_names;
 
              for (auto &item : input_dict) {
@@ -2090,7 +2136,7 @@ All parameter, weight, gradient are variables in Paddle.
                feed_tensors.push_back(t);
              }
 
-             CostInfo cost_info;
+             framework::interpreter::CostInfo cost_info;
              {
                pybind11::gil_scoped_release release;
                cost_info = self.DryRun(feed_names, feed_tensors);
@@ -2113,6 +2159,8 @@ All parameter, weight, gradient are variables in Paddle.
   m.def("_is_compiled_with_heterps", IsCompiledWithHETERPS);
   m.def("supports_bfloat16", SupportsBfloat16);
   m.def("supports_bfloat16_fast_performance", SupportsBfloat16FastPerformance);
+  m.def("supports_int8", SupportsInt8);
+  m.def("supports_vnni", SupportsVNNI);
   m.def("op_supported_infos", OpSupportedInfos);
   m.def("is_compiled_with_brpc", IsCompiledWithBrpc);
   m.def("is_compiled_with_dist", IsCompiledWithDIST);
@@ -2189,6 +2237,7 @@ All parameter, weight, gradient are variables in Paddle.
   BindConstValue(&m);
   BindGlobalValueGetterSetter(&m);
   BindProcessMeshDesc(&m);
+  BindFleetExecutor(&m);
 
   py::class_<framework::LoDRankTable>(m, "LodRankTable")
       .def("items", [](framework::LoDRankTable &table) {
@@ -2352,23 +2401,31 @@ All parameter, weight, gradient are variables in Paddle.
         py::return_value_policy::copy);
 
   py::class_<gpuDeviceProp>(m, "_gpuDeviceProperties")
-      .def_readonly("name", &gpuDeviceProp::name)
-      .def_readonly("major", &gpuDeviceProp::major)
-      .def_readonly("minor", &gpuDeviceProp::minor)
-      .def_readonly("is_multi_gpu_board", &gpuDeviceProp::isMultiGpuBoard)
-      .def_readonly("is_integrated", &gpuDeviceProp::integrated)
-      .def_readonly("multi_processor_count",
-                    &gpuDeviceProp::multiProcessorCount)
-      .def_readonly("total_memory", &gpuDeviceProp::totalGlobalMem)
-      .def("__repr__", [](const gpuDeviceProp &gpu_device_prop) {
-        std::ostringstream stream;
-        stream << "_gpuDeviceProperties(name='" << gpu_device_prop.name
-               << "', major=" << gpu_device_prop.major
-               << ", minor=" << gpu_device_prop.minor << ", total_memory="
-               << gpu_device_prop.totalGlobalMem / (1024 * 1024)
-               << "MB, multi_processor_count="
-               << gpu_device_prop.multiProcessorCount << ")";
-        return stream.str();
+      .def_property_readonly(
+          "name", [](const gpuDeviceProp &prop) { return prop.name; })
+      .def_property_readonly(
+          "major", [](const gpuDeviceProp &prop) { return prop.major; })
+      .def_property_readonly(
+          "minor", [](const gpuDeviceProp &prop) { return prop.minor; })
+      .def_property_readonly(
+          "total_memory",
+          [](const gpuDeviceProp &prop) { return prop.totalGlobalMem; })
+      .def_property_readonly(
+          "multi_processor_count",
+          [](const gpuDeviceProp &prop) { return prop.multiProcessorCount; })
+      .def_property_readonly(
+          "is_multi_gpu_board",
+          [](const gpuDeviceProp &prop) { return prop.isMultiGpuBoard; })
+      .def_property_readonly(
+          "is_integrated",
+          [](const gpuDeviceProp &prop) { return prop.integrated; })
+      .def("__repr__", [](const gpuDeviceProp &prop) {
+        std::stringstream ostr;
+        ostr << "_gpuDeviceProperties(name='" << prop.name
+             << "', major=" << prop.major << ", minor=" << prop.minor
+             << ", total_memory=" << prop.totalGlobalMem / (1024 * 1024)
+             << "MB, multi_processor_count=" << prop.multiProcessorCount << ")";
+        return ostr.str();
       });
 
 #if !defined(PADDLE_WITH_HIP) && !defined(_WIN32)
diff --git a/paddle/pten/CMakeLists.txt b/paddle/pten/CMakeLists.txt
new file mode 100644
index 0000000000000..0b3bb2557039c
--- /dev/null
+++ b/paddle/pten/CMakeLists.txt
@@ -0,0 +1,23 @@
+# pten (low level) api headers: include
+# pten (high level) api
+add_subdirectory(api)
+# pten core components
+add_subdirectory(core)
+# pten kernels for diff device
+add_subdirectory(kernels)
+# pten infershape
+add_subdirectory(infershape)
+# pten tests
+add_subdirectory(tests)
+
+# make an unity target for compile deps
+set(PTEN_DEPS convert_utils dense_tensor kernel_factory kernel_context)
+set(PTEN_DEPS ${PTEN_DEPS} math_cpu linalg_cpu creation_cpu manipulation_cpu)
+set(PTEN_DEPS ${PTEN_DEPS} nary unary binary)
+if(WITH_GPU OR WITH_ROCM)
+  set(PTEN_DEPS ${PTEN_DEPS} math_cuda linalg_cuda creation_cuda manipulation_cuda)
+endif()
+if(WITH_XPU)
+  set(PTEN_DEPS ${PTEN_DEPS} manipulation_xpu)
+endif()
+cc_library(pten SRCS all.cc DEPS ${PTEN_DEPS})
diff --git a/paddle/pten/all.cc b/paddle/pten/all.cc
new file mode 100644
index 0000000000000..d8d96e1cd461e
--- /dev/null
+++ b/paddle/pten/all.cc
@@ -0,0 +1,17 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/pten/all.h"
+
+namespace pten {}  // namespace pten
diff --git a/paddle/pten/all.h b/paddle/pten/all.h
new file mode 100644
index 0000000000000..e8e41a8c3eabc
--- /dev/null
+++ b/paddle/pten/all.h
@@ -0,0 +1,23 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+// develop apis
+#include "paddle/pten/include/core.h"
+#include "paddle/pten/include/creation.h"
+#include "paddle/pten/include/infershape.h"
+#include "paddle/pten/include/linalg.h"
+#include "paddle/pten/include/manipulation.h"
+#include "paddle/pten/include/math.h"
diff --git a/paddle/pten/api/CMakeLists.txt b/paddle/pten/api/CMakeLists.txt
new file mode 100644
index 0000000000000..387da3bc68f12
--- /dev/null
+++ b/paddle/pten/api/CMakeLists.txt
@@ -0,0 +1,3 @@
+add_subdirectory(lib)
+
+cc_library(pten_api SRCS all.cc DEPS linalg_api math_api creation_api manipulation_api)
diff --git a/paddle/pten/api/all.cc b/paddle/pten/api/all.cc
new file mode 100644
index 0000000000000..c270fbb070689
--- /dev/null
+++ b/paddle/pten/api/all.cc
@@ -0,0 +1,19 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/pten/api/all.h"
+
+namespace paddle {
+namespace experimental {}  // namespace experimental
+}  // namespace paddle
diff --git a/paddle/pten/api/all.h b/paddle/pten/api/all.h
new file mode 100644
index 0000000000000..3a2c7b3fa98fe
--- /dev/null
+++ b/paddle/pten/api/all.h
@@ -0,0 +1,22 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+// user apis
+#include "paddle/pten/api/include/creation.h"
+#include "paddle/pten/api/include/linalg.h"
+#include "paddle/pten/api/include/manipulation.h"
+#include "paddle/pten/api/include/math.h"
+#include "paddle/pten/api/include/tensor.h"
diff --git a/paddle/pten/api/include/backend_set.h b/paddle/pten/api/include/backend_set.h
new file mode 100644
index 0000000000000..e01c195e95530
--- /dev/null
+++ b/paddle/pten/api/include/backend_set.h
@@ -0,0 +1,72 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <ostream>
+
+#include "paddle/fluid/platform/enforce.h"
+#include "paddle/pten/common/backend.h"
+namespace paddle {
+namespace experimental {
+
+/**
+ * We use the backend to form a bit set to assist the runtime kernel selection,
+ * and the higher backend bit has a higher priority.
+ *
+ * A Tensor may belong to multiple backends at the same time, such CPU and
+ * MKLDNN. Only one backend value cannot
+ */
+class BackendSet final {
+ public:
+  constexpr BackendSet() : bitset_(0) {}
+  explicit constexpr BackendSet(Backend b)
+      : bitset_(b == Backend::UNDEFINED ? 0 : 1ULL << (static_cast<uint8_t>(b) -
+                                                       1)) {}
+
+  uint64_t bitset() const { return bitset_; }
+
+  bool inline Has(Backend b) const {
+    PADDLE_ENFORCE_NE(b,
+                      Backend::UNDEFINED,
+                      platform::errors::InvalidArgument(
+                          "Backend argument can't be UNDEFINED."));
+    return static_cast<bool>(bitset_ & BackendSet(b).bitset());
+  }
+  bool IsEmpty() const { return bitset_ == 0; }
+
+  BackendSet operator|(const BackendSet& other) const {
+    return BackendSet(bitset_ | other.bitset());
+  }
+  BackendSet operator&(const BackendSet& other) const {
+    return BackendSet(bitset_ & other.bitset());
+  }
+  BackendSet operator-(const BackendSet& other) const {
+    return BackendSet(bitset_ & ~other.bitset());
+  }
+  BackendSet operator^(const BackendSet& other) const {
+    return BackendSet(bitset_ ^ other.bitset());
+  }
+
+  bool operator==(const BackendSet& other) const {
+    return bitset_ == other.bitset();
+  }
+
+ private:
+  constexpr BackendSet(uint64_t bitset) : bitset_(bitset) {}
+  uint64_t bitset_;
+};
+
+}  // namespace experimental
+}  // namespace paddle
diff --git a/paddle/pten/api/include/creation.h b/paddle/pten/api/include/creation.h
new file mode 100644
index 0000000000000..b7e7bf55c6bc5
--- /dev/null
+++ b/paddle/pten/api/include/creation.h
@@ -0,0 +1,39 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/pten/api/include/tensor.h"
+#include "paddle/pten/common/data_type.h"
+#include "paddle/pten/common/scalar.h"
+
+namespace paddle {
+namespace experimental {
+
+Tensor full(const std::vector<int64_t>& shape,
+            const Scalar& value,
+            DataType dtype = DataType::FLOAT32,
+            Backend backend = Backend::CPU,
+            DataLayout layout = DataLayout::NCHW);
+
+Tensor full_like(const Tensor& x,
+                 const Scalar& value,
+                 DataType dtype = DataType::UNDEFINED);
+
+Tensor ones_like(const Tensor& x, DataType dtype = DataType::UNDEFINED);
+
+Tensor zeros_like(const Tensor& x, DataType dtype = DataType::UNDEFINED);
+
+}  // namespace experimental
+}  // namespace paddle
diff --git a/paddle/pten/api/include/linalg.h b/paddle/pten/api/include/linalg.h
new file mode 100644
index 0000000000000..c28c133018464
--- /dev/null
+++ b/paddle/pten/api/include/linalg.h
@@ -0,0 +1,30 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/pten/api/include/tensor.h"
+
+namespace paddle {
+namespace experimental {
+
+Tensor dot(const Tensor& x, const Tensor& y);
+
+Tensor matmul(const Tensor& x,
+              const Tensor& y,
+              bool transpose_x,
+              bool transpose_y);
+
+}  // namespace experimental
+}  // namespace paddle
diff --git a/paddle/pten/api/include/manipulation.h b/paddle/pten/api/include/manipulation.h
new file mode 100644
index 0000000000000..fe8c01cb74b95
--- /dev/null
+++ b/paddle/pten/api/include/manipulation.h
@@ -0,0 +1,25 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/pten/api/include/tensor.h"
+
+namespace paddle {
+namespace experimental {
+
+Tensor flatten(const Tensor& x, int start_axis, int stop_axis);
+
+}  // namespace experimental
+}  // namespace paddle
diff --git a/paddle/pten/api/include/math.h b/paddle/pten/api/include/math.h
new file mode 100644
index 0000000000000..9fea515646d6e
--- /dev/null
+++ b/paddle/pten/api/include/math.h
@@ -0,0 +1,27 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/pten/api/include/tensor.h"
+
+namespace paddle {
+namespace experimental {
+
+// TODO(chenweihang): add scale API
+// TODO(chenweihang): move mean API into stat.h/cc
+Tensor mean(const Tensor& x);
+
+}  // namespace experimental
+}  // namespace paddle
diff --git a/paddle/pten/api/include/tensor.h b/paddle/pten/api/include/tensor.h
new file mode 100644
index 0000000000000..66ea7853541bd
--- /dev/null
+++ b/paddle/pten/api/include/tensor.h
@@ -0,0 +1,258 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <functional>
+#include <memory>
+#include <utility>
+
+#include "paddle/pten/core/tensor_base.h"
+
+/**
+ * [ Why still include the fluid headers? ]
+ *
+ * We hope to organize the basic implementation of Tensor and the logic related
+ * to Tensor computation into an independent library, which we call
+ * [Tensor Operation Library, pten], so we extract or rewrite the original
+ * Kernels.
+ *
+ * In the future, the training library, inference library and custom operators
+ * will link to this Tensor Operation library.
+ *
+ * However, if we directly split the link relation, we need to make too many
+ * changes, which will affect the stability of the framework, so here we still
+ * rely on the implementation of the framework, which is a intermediate state.
+ *
+ * In the future, the necessary components will be moved to the this library,
+ * or the corresponding components will be re-implemented.
+ */
+#include "paddle/fluid/framework/ddim.h"
+#include "paddle/fluid/platform/enforce.h"
+#include "paddle/fluid/platform/place.h"
+
+namespace paddle {
+namespace experimental {
+
+class Tensor;
+
+class AbstractAutogradMeta {
+ public:
+  // No AbstractAutogradMeta should be created
+  virtual ~AbstractAutogradMeta() {}
+};
+
+/**
+ * Tensor is the API description of the basic data structure in the
+ * [ "Paddle Tensor Operation (pten)" Library ].
+ *
+ * It is not limited to a simple n-dimensional array.
+ * It contains a smart pointer to `TensorImpl`. The data description contained
+ * in Tensor is defined by TensorImpl. Tensor only defines the interface for
+ * computation.
+ *
+ * This is a new Tensor design, which is independent of the original
+ * framework::Tensor in fluid. The original Tensor will be gradually discarded
+ * in the future.
+ *
+ * Note: Tensor can be NULL state, Tensor is meaningful only when the
+ * TensorImpl to which it is pointed is not empty.
+ *
+ * Note: For the consistency of C++ API self, and the consistency between C++
+ * API and Python API, all member methods of Tensor are named with lowercase
+ * letters and underscores.
+ *
+ * Note: Tensor cannot be inherited. The heterogeneous Tensor implementation
+ * can be achieved by inheriting the underlying TensorBase.
+ *
+ * Note: This Tensor API is suitable for training and custom operators,
+ * another simple Tensor design may be required for inference.
+ */
+
+class Tensor final {
+ public:
+  /* Part 1: Construction and destruction methods */
+  Tensor() {}
+  Tensor(const Tensor&) = default;
+  Tensor(Tensor&&) = default;
+
+  /**
+   * @description: Use a TensorImpl pointer to construct a Tensor
+   * @param {shared_ptr<TensorBase>} tensor_impl
+   * @return {Tensor}
+   */
+  explicit Tensor(std::shared_ptr<pten::TensorBase> tensor_impl)
+      : impl_(std::move(tensor_impl)) {
+    PADDLE_ENFORCE_NOT_NULL(impl_,
+                            platform::errors::InvalidArgument(
+                                "TensorImpl with nullptr is not supported"));
+  }
+
+  /* Part 2: Dimension, DataType and DataLayout methods */
+  /**
+   * @description: Return the number of elements of current Tensor.
+   * @param None
+   * @return {int64_t}
+   */
+  int64_t numel() const { return impl_->numel(); }
+
+  /**
+   * @description: Return the shape (dimensions) of current Tensor.
+   * @param None
+   * @return {DDim}
+   */
+  paddle::framework::DDim shape() const { return impl_->dims(); }
+
+  /**
+   * @description: Return the data type of current Tensor.
+   * @param None
+   * @return {DataType}
+   */
+  paddle::experimental::DataType type() const { return impl_->data_type(); }
+
+  /**
+   * @description: Return the layout of current Tensor.
+   * @param None
+   * @return {DataLayout}
+   */
+  paddle::experimental::DataLayout layout() const { return impl_->layout(); }
+
+  /* Part 3: Device and Backend methods */
+  /**
+   * @description: Return the place (device) of current Tensor.
+   * @param None
+   * @return {Place}
+   */
+  paddle::platform::Place place() const { return impl_->place(); }
+
+  /**
+   * Backend judgment APIs, shield the concept of Backend.
+   */
+  bool is_cpu() const { return paddle::platform::is_cpu_place(place()); }
+  bool is_cuda() const { return paddle::platform::is_gpu_place(place()); }
+
+  /**
+   * Backend convert APIs.
+   */
+  Tensor cpu() const;
+  Tensor cuda() const;
+
+  /* Part 4: Data Access methods */
+  /**
+   * @description: Return the implemention of current Tensor.
+   * @param None
+   * @return {std::shared_ptr<TensorBase>}
+   */
+  std::shared_ptr<pten::TensorBase> impl() const { return impl_; }
+
+  /**
+   * @description: Set the implemention of current Tensor.
+   * @param {std::shared_ptr<TensorBase>}
+   * @return None
+   */
+  void set_impl(const std::shared_ptr<pten::TensorBase>& impl) { impl_ = impl; }
+
+  // TODO(chenweihang): Whether API Tensor need `data` and `mutable_data`?
+
+  // TODO(chenweihang): slice and split methods use kernels?
+
+  /* Part 5: Status utils methods */
+  /**
+   * @description: Determine whether it is a meaningful Tensor
+   * @param None
+   * @return {bool}
+   */
+  bool defined() const { return impl_ != nullptr; }
+
+  /**
+   * @description: Determine whether Tensor is initialized
+   * @param None
+   * @return {bool}
+   */
+  bool initialized() const { return impl_->initialized(); }
+
+  /**
+   * @description: Reset the Tensor implementation
+   * @param None
+   * @return {void}
+   */
+  void reset() { impl_.reset(); }
+
+  /* Part 6: Operator overloading */
+  Tensor& operator=(const Tensor& x) & {
+    impl_ = x.impl_;
+    autograd_meta_ = x.autograd_meta_;
+    return *this;
+  }
+  Tensor& operator=(Tensor&& x) & {
+    impl_ = std::move(x.impl_);
+    autograd_meta_ = std::move(x.autograd_meta_);
+    return *this;
+  }
+
+  /* Part 7: Autograd methods */
+  AbstractAutogradMeta* get_autograd_meta() const {
+    return autograd_meta_.get();
+  }
+
+  void set_autograd_meta(std::shared_ptr<AbstractAutogradMeta> autograd_meta) {
+    autograd_meta_ = std::move(autograd_meta);
+  }
+
+  /* Part 8: Auto generated Tensor methods */
+  // ...
+
+ private:
+  /**
+   * [ Why use abstract TensorImpl interface here? ]
+   *
+   * We hope that the data structure at the API level of the framework can be
+   * unified to Tensor, but Tensor itself is heterogeneous.
+   *
+   * Tensor can generally be represented by void* and size_t, place.
+   * This is suitable for most scenarios including CPU, CUDA, HIP, CPU, etc.,
+   * but there are a few cases where this definition cannot be described,
+   * such as the Tensor representation in third-party lib such as Metal,
+   * OpenCL, etc., as well as some special Tensor implementations, including
+   * Tensor containing only one Scalar value, or Tensor representing String,
+   * etc.
+   *
+   * Therefore, we hope to use a unified interface to shield the underlying
+   * heterogeneous Tensor implementation, so that the API level can be unified
+   * to one `Tensor`.
+   */
+  std::shared_ptr<pten::TensorBase> impl_;
+
+  /**
+   * [ Why need abstract AbstractAutogradMeta here? ]
+   *
+   * Dynamic graphs need to hold backward information
+   *
+   * [ Why AutogradMeta not in TensorImpl? ]
+   *
+   * 1. AutogradMeta is only used in dynamic graph, It is execution-related
+   *    information, not Tensor data description-related information.
+   * 2. Kernel calculation does not require AutogradMeta.
+   */
+  std::shared_ptr<AbstractAutogradMeta> autograd_meta_{nullptr};
+
+  /**
+   * Tensor name: used for adapt original execution mechanism and debug analysis
+   * in the development of new dygraph.
+   */
+  std::string name_;
+};
+
+}  // namespace experimental
+}  // namespace paddle
diff --git a/paddle/pten/api/lib/CMakeLists.txt b/paddle/pten/api/lib/CMakeLists.txt
new file mode 100644
index 0000000000000..a4726b3d426f6
--- /dev/null
+++ b/paddle/pten/api/lib/CMakeLists.txt
@@ -0,0 +1,6 @@
+add_subdirectory(utils)
+
+cc_library(math_api SRCS math.cc DEPS pten)
+cc_library(linalg_api SRCS linalg.cc DEPS pten)
+cc_library(creation_api SRCS creation.cc DEPS pten)
+cc_library(manipulation_api SRCS manipulation.cc DEPS pten)
diff --git a/paddle/pten/api/lib/creation.cc b/paddle/pten/api/lib/creation.cc
new file mode 100644
index 0000000000000..e2cd611dbda5f
--- /dev/null
+++ b/paddle/pten/api/lib/creation.cc
@@ -0,0 +1,116 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/pten/api/include/creation.h"
+
+#include <memory>
+
+#include "glog/logging.h"
+
+#include "paddle/pten/api/lib/kernel_dispatch.h"
+#include "paddle/pten/api/lib/utils/allocator.h"
+#include "paddle/pten/include/core.h"
+#include "paddle/pten/include/infershape.h"
+
+namespace paddle {
+namespace experimental {
+
+Tensor full(const std::vector<int64_t>& shape,
+            const Scalar& value,
+            DataType dtype,
+            Backend backend,
+            DataLayout layout) {
+  // 1. Get kernel signature and kernel
+  pten::KernelKey kernel_key{backend, layout, dtype};
+  auto kernel = pten::KernelFactory::Instance().SelectKernelOrThrowError(
+      "fill_constant.scalar", kernel_key);
+
+  // 2. Get Device Context
+  auto* dev_ctx = GetDeviceContextByBackend(kernel_key.backend());
+  auto kernel_context = pten::KernelContext(dev_ctx);
+
+  // 3. Auto data transform
+  kernel_context.EmplaceBackAttr(value);
+
+  // 4. InferShape
+  auto out_meta = pten::FullInferShape(shape, dtype, layout);
+
+  // 5. Prepare outputs
+  const auto allocator =
+      std::make_shared<paddle::experimental::DefaultAllocator>(
+          pten::TransToFluidPlace(kernel_key.backend()));
+  auto dense_out = std::make_shared<pten::DenseTensor>(allocator, out_meta);
+  kernel_context.EmplaceBackOutput(dense_out);
+  Tensor out;
+  out.set_impl(dense_out);
+
+  // 6. Call kernel
+  kernel(&kernel_context);
+
+  return out;
+}
+
+Tensor full_like(const Tensor& x,
+                 const Scalar& value,
+                 paddle::experimental::DataType dtype) {
+  // 1. Get kernel signature and kernel
+  auto kernel_key_set = ParseKernelKeyByInputArgs(x);
+  auto kernel_key = kernel_key_set.GetHigestPriorityKernelKey();
+  auto kernel = pten::KernelFactory::Instance().SelectKernelOrThrowError(
+      "fill_any_like",
+      {kernel_key.backend(),
+       kernel_key.layout(),
+       dtype == DataType::UNDEFINED ? kernel_key.dtype() : dtype});
+
+  // 2. Get Device Context
+  auto* dev_ctx = GetDeviceContextByBackend(kernel_key.backend());
+  auto kernel_context = pten::KernelContext(dev_ctx);
+
+  // 3. Auto data transform
+  auto dense_x = std::dynamic_pointer_cast<pten::DenseTensor>(x.impl());
+  kernel_context.EmplaceBackInput(dense_x);
+  kernel_context.EmplaceBackAttr(value);
+
+  // 4. InferShape
+  auto out_meta = UnchangedInferShape(dense_x->meta());
+
+  // 5. Prepare outputs
+  Tensor out;
+  // InferDataType
+  if (dtype != pten::DataType::UNDEFINED) {
+    const_cast<pten::DenseTensorMeta::DataType&>(out_meta.type) = dtype;
+  }
+  const auto allocator =
+      std::make_shared<paddle::experimental::DefaultAllocator>(
+          pten::TransToFluidPlace(kernel_key.backend()));
+  auto dense_out = std::make_shared<pten::DenseTensor>(allocator, out_meta);
+  kernel_context.EmplaceBackOutput(dense_out);
+  out.set_impl(dense_out);
+
+  // 6. Call kernel
+  kernel(&kernel_context);
+
+  return out;
+}
+
+Tensor ones_like(const Tensor& x, DataType dtype) {
+  return full_like(x, 1, dtype);
+}
+
+Tensor zeros_like(const Tensor& x, DataType dtype) {
+  return full_like(x, 0, dtype);
+}
+
+}  // namespace experimental
+}  // namespace paddle
diff --git a/paddle/pten/api/lib/kernel_dispatch.h b/paddle/pten/api/lib/kernel_dispatch.h
new file mode 100644
index 0000000000000..567c21eeee9e8
--- /dev/null
+++ b/paddle/pten/api/lib/kernel_dispatch.h
@@ -0,0 +1,154 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <limits>
+#include <string>
+#include <utility>
+
+#include "paddle/pten/api/include/backend_set.h"
+#include "paddle/pten/api/include/tensor.h"
+#include "paddle/pten/common/data_type.h"
+#include "paddle/pten/common/layout.h"
+
+// TODO(chenweihang): split KernelName, Key, Kernel, Factory into diff files
+#include "paddle/pten/core/convert_utils.h"
+#include "paddle/pten/core/kernel_factory.h"
+
+// See Note [ Why still include the fluid headers? ]
+#include "paddle/fluid/platform/device_context.h"
+
+namespace paddle {
+namespace experimental {
+
+// TODO(shixiaowei): replaced by new DeviceContext later
+using CPUContext = paddle::platform::CPUDeviceContext;
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+using CUDAContext = paddle::platform::CUDADeviceContext;
+#endif
+
+namespace detail {
+BackendSet GetTensorBackendSet(const Tensor& t) {
+  BackendSet backend_set(pten::TransToPtenBackend(t.place()));
+  switch (t.layout()) {
+    case DataLayout::MKLDNN:
+      backend_set = backend_set | BackendSet(Backend::MKLDNN);
+      break;
+    default:
+      // do nothing
+      break;
+  }
+  return backend_set;
+}
+
+std::size_t CountLeadingZeros(uint64_t val) {
+  if (val == 0) {
+    return 64;
+  }
+  std::size_t zero_bits = 0;
+  for (std::size_t shift = 64 >> 1; shift; shift >>= 1) {
+    uint64_t tmp = val >> shift;
+    if (tmp) {
+      val = tmp;
+    } else {
+      zero_bits |= shift;
+    }
+  }
+  return zero_bits;
+}
+}  // namespace detail
+
+// TODO(chenweihang): support DataLayout and DataType selected
+struct KernelKeySet {
+  BackendSet backend_set{Backend::UNDEFINED};
+  DataLayout layout{DataLayout::UNDEFINED};
+  DataType dtype{DataType::UNDEFINED};
+
+  // TODO(chenweihang): iterate all kernelkey for kernel selection
+  pten::KernelKey GetHigestPriorityKernelKey() {
+    return pten::KernelKey(static_cast<Backend>(64 - detail::CountLeadingZeros(
+                                                         backend_set.bitset())),
+                           layout,
+                           dtype);
+  }
+};
+
+namespace detail {
+
+template <typename Functor>
+struct ArgsIterator {
+  template <typename... Args>
+  inline Functor& apply() {
+    return self();
+  }
+
+  template <typename T, typename... Args>
+  inline Functor& apply(T&& arg, Args&&... args) {
+    self()(std::forward<T>(arg));
+    if (self().short_circuit()) {
+      return self();
+    } else {
+      return apply(std::forward<Args>(args)...);
+    }
+  }
+
+  constexpr bool short_circuit() const { return false; }
+
+ private:
+  inline Functor& self() { return *static_cast<Functor*>(this); }
+};
+
+struct KernelKeyParser : ArgsIterator<KernelKeyParser> {
+  KernelKeySet key_set;
+
+  // TODO(chenweihang): deal with multiple diff input Tensors
+  // TODO(chenweihang): add global device guard method to set backend
+  void operator()(const Tensor& x) {
+    key_set.backend_set = key_set.backend_set | detail::GetTensorBackendSet(x);
+    // TODO(chenweihang): selecte multi layout and dtype
+    key_set.layout = x.layout();
+    key_set.dtype = x.type();
+  }
+
+  void operator()(const std::vector<Tensor>& x) {
+    key_set.backend_set =
+        key_set.backend_set | detail::GetTensorBackendSet(x[0]);
+    // TODO(chenweihang): selecte multi layout and dtype
+    key_set.layout = x[0].layout();
+    key_set.dtype = x[0].type();
+  }
+
+  // skip other type args, these args don't used in kernel selection
+  template <typename T>
+  void operator()(const T& x) {
+    // do nothing
+  }
+};
+
+}  // namespace detail
+
+template <typename... Args>
+KernelKeySet ParseKernelKeyByInputArgs(const Args&... args) {
+  return detail::KernelKeyParser().apply(args...).key_set;
+}
+
+paddle::platform::DeviceContext* GetDeviceContextByBackend(
+    pten::Backend backend) {
+  auto& pool = paddle::platform::DeviceContextPool::Instance();
+  return pool.Get(pten::TransToFluidPlace(backend));
+}
+
+}  // namespace experimental
+}  // namespace paddle
diff --git a/paddle/pten/api/lib/linalg.cc b/paddle/pten/api/lib/linalg.cc
new file mode 100644
index 0000000000000..0ede7b8a68b41
--- /dev/null
+++ b/paddle/pten/api/lib/linalg.cc
@@ -0,0 +1,110 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/pten/api/include/linalg.h"
+
+#include <memory>
+
+#include "glog/logging.h"
+
+#include "paddle/pten/api/lib/kernel_dispatch.h"
+#include "paddle/pten/api/lib/utils/allocator.h"
+#include "paddle/pten/core/convert_utils.h"
+#include "paddle/pten/core/dense_tensor.h"
+#include "paddle/pten/core/kernel_context.h"
+#include "paddle/pten/include/core.h"
+#include "paddle/pten/include/infershape.h"
+
+namespace paddle {
+namespace experimental {
+
+Tensor dot(const Tensor& x, const Tensor& y) {
+  // 1. Get kernel signature and kernel
+  auto kernel_key_set = ParseKernelKeyByInputArgs(x);
+  auto kernel_key = kernel_key_set.GetHigestPriorityKernelKey();
+  auto kernel = pten::KernelFactory::Instance().SelectKernelOrThrowError(
+      "dot", kernel_key);
+
+  // 2. Get Device Context
+  auto* dev_ctx = GetDeviceContextByBackend(kernel_key.backend());
+  auto kernel_context = pten::KernelContext(dev_ctx);
+
+  // 3. Auto data transform
+  auto dense_x = std::dynamic_pointer_cast<pten::DenseTensor>(x.impl());
+  kernel_context.EmplaceBackInput(dense_x);
+  auto dense_y = std::dynamic_pointer_cast<pten::DenseTensor>(y.impl());
+  kernel_context.EmplaceBackInput(dense_y);
+  // TODO(chenweihang): add transform impl
+
+  // 4. InferShape
+  auto out_meta = DotInferShape(dense_x->meta(), dense_y->meta());
+
+  // 5. Prepare outputs
+  Tensor out;
+  const auto allocator = std::make_shared<DefaultAllocator>(
+      pten::TransToFluidPlace(kernel_key.backend()));
+  auto dense_out = std::make_shared<pten::DenseTensor>(allocator, out_meta);
+  kernel_context.EmplaceBackOutput(dense_out);
+  out.set_impl(dense_out);
+
+  // 6. Call kernel
+  kernel(&kernel_context);
+
+  return out;
+}
+
+Tensor matmul(const Tensor& x,
+              const Tensor& y,
+              bool transpose_x,
+              bool transpose_y) {
+  // 1. Get kernel signature and kernel
+  auto kernel_key_set = ParseKernelKeyByInputArgs(x, y);
+  auto kernel_key = kernel_key_set.GetHigestPriorityKernelKey();
+  auto kernel = pten::KernelFactory::Instance().SelectKernelOrThrowError(
+      "matmul_v2", kernel_key);
+
+  // 2. Get Device Context
+  auto* dev_ctx = GetDeviceContextByBackend(kernel_key.backend());
+  auto kernel_context = pten::KernelContext(dev_ctx);
+
+  // 3. Auto data transform
+  auto dense_x = std::dynamic_pointer_cast<pten::DenseTensor>(x.impl());
+  auto dense_y = std::dynamic_pointer_cast<pten::DenseTensor>(y.impl());
+  kernel_context.EmplaceBackInput(dense_x);
+  kernel_context.EmplaceBackInput(dense_y);
+  kernel_context.EmplaceBackAttr(transpose_x);
+  kernel_context.EmplaceBackAttr(transpose_y);
+  // TODO(chenweihang): add transform impl
+
+  // 4. InferShape
+  auto out_meta = MatmulInferShape(
+      dense_x->meta(), dense_y->meta(), transpose_x, transpose_y);
+
+  // 5. Prepare outputs
+  const auto allocator = std::make_shared<DefaultAllocator>(
+      pten::TransToFluidPlace(kernel_key.backend()));
+  auto dense_out = std::make_shared<pten::DenseTensor>(allocator, out_meta);
+  kernel_context.EmplaceBackOutput(dense_out);
+
+  Tensor out;
+  out.set_impl(dense_out);
+
+  // 6. Call kernel
+  kernel(&kernel_context);
+
+  return out;
+}
+
+}  // namespace experimental
+}  // namespace paddle
diff --git a/paddle/pten/api/lib/manipulation.cc b/paddle/pten/api/lib/manipulation.cc
new file mode 100644
index 0000000000000..dd16f4f7f5825
--- /dev/null
+++ b/paddle/pten/api/lib/manipulation.cc
@@ -0,0 +1,62 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/pten/api/include/manipulation.h"
+
+#include <memory>
+
+#include "glog/logging.h"
+#include "paddle/pten/api/lib/kernel_dispatch.h"
+#include "paddle/pten/api/lib/utils/allocator.h"
+#include "paddle/pten/include/core.h"
+#include "paddle/pten/infershape/unary.h"
+
+namespace paddle {
+namespace experimental {
+
+Tensor flatten(const Tensor& x, int start_axis, int stop_axis) {
+  // 1. Get kernel signature and kernel
+  auto kernel_key_set = ParseKernelKeyByInputArgs(x);
+  auto kernel_key = kernel_key_set.GetHigestPriorityKernelKey();
+  auto kernel = pten::KernelFactory::Instance().SelectKernelOrThrowError(
+      "flatten_contiguous_range", kernel_key);
+
+  // 2. Get Device Context
+  auto* dev_ctx = GetDeviceContextByBackend(kernel_key.backend());
+  auto kernel_context = pten::KernelContext(dev_ctx);
+
+  // 3. Auto data transform
+  auto dense_x = std::dynamic_pointer_cast<pten::DenseTensor>(x.impl());
+  kernel_context.EmplaceBackInput(dense_x);
+  kernel_context.EmplaceBackAttr(start_axis);
+  kernel_context.EmplaceBackAttr(stop_axis);
+
+  // 4. InferShape
+  auto out_meta = FlattenInferShape(dense_x->meta(), start_axis, stop_axis);
+
+  // 5. Prepare outputs
+  Tensor out;
+  const auto allocator = std::make_shared<DefaultAllocator>(
+      pten::TransToFluidPlace(kernel_key.backend()));
+  auto dense_out = std::make_shared<pten::DenseTensor>(allocator, out_meta);
+  kernel_context.EmplaceBackOutput(dense_out);
+  out.set_impl(dense_out);
+
+  // 6. Call kernel
+  kernel(&kernel_context);
+
+  return out;
+}
+}  // namespace experimental
+}  // namespace paddle
diff --git a/paddle/pten/api/lib/math.cc b/paddle/pten/api/lib/math.cc
new file mode 100644
index 0000000000000..8102bbaaa58ea
--- /dev/null
+++ b/paddle/pten/api/lib/math.cc
@@ -0,0 +1,64 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/pten/api/include/math.h"
+
+#include <memory>
+
+#include "glog/logging.h"
+
+#include "paddle/pten/api/lib/kernel_dispatch.h"
+#include "paddle/pten/api/lib/utils/allocator.h"
+#include "paddle/pten/include/core.h"
+#include "paddle/pten/include/infershape.h"
+#include "paddle/pten/infershape/unary.h"
+
+namespace paddle {
+namespace experimental {
+
+Tensor mean(const Tensor& x) {
+  // 1. Get kernel signature and kernel
+  auto kernel_key_set = ParseKernelKeyByInputArgs(x);
+  auto kernel_key = kernel_key_set.GetHigestPriorityKernelKey();
+  auto kernel = pten::KernelFactory::Instance().SelectKernelOrThrowError(
+      "mean", kernel_key);
+
+  // 2. Get Device Context
+  auto* dev_ctx = GetDeviceContextByBackend(kernel_key.backend());
+  auto kernel_context = pten::KernelContext(dev_ctx);
+
+  // 3. Auto data transform
+  auto dense_x = std::dynamic_pointer_cast<pten::DenseTensor>(x.impl());
+  kernel_context.EmplaceBackInput(dense_x);
+
+  // 4. InferShape
+  auto out_meta = ReductionInferShape(dense_x->meta());
+
+  // 5. Prepare outputs
+  Tensor out;
+  const auto allocator =
+      std::make_shared<paddle::experimental::DefaultAllocator>(
+          pten::TransToFluidPlace(kernel_key.backend()));
+  auto dense_out = std::make_shared<pten::DenseTensor>(allocator, out_meta);
+  kernel_context.EmplaceBackOutput(dense_out);
+  out.set_impl(dense_out);
+
+  // 6. Call kernel
+  kernel(&kernel_context);
+
+  return out;
+}
+
+}  // namespace experimental
+}  // namespace paddle
diff --git a/paddle/pten/api/lib/utils/CMakeLists.txt b/paddle/pten/api/lib/utils/CMakeLists.txt
new file mode 100644
index 0000000000000..34f8da94c6611
--- /dev/null
+++ b/paddle/pten/api/lib/utils/CMakeLists.txt
@@ -0,0 +1 @@
+cc_library(pten_api_utils SRCS allocator.cc storage.cc tensor_utils.cc DEPS tensor_base convert_utils dense_tensor lod_tensor selected_rows place var_type_traits)
diff --git a/paddle/pten/api/lib/utils/allocator.cc b/paddle/pten/api/lib/utils/allocator.cc
new file mode 100644
index 0000000000000..e80152431e712
--- /dev/null
+++ b/paddle/pten/api/lib/utils/allocator.cc
@@ -0,0 +1,23 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/pten/api/lib/utils/allocator.h"
+
+namespace paddle {
+namespace experimental {
+
+memory::Allocator::AllocationDeleter DefaultAllocator::deleter_;
+
+}  // namespace experimental
+}  // namespace paddle
diff --git a/paddle/pten/api/lib/utils/allocator.h b/paddle/pten/api/lib/utils/allocator.h
new file mode 100644
index 0000000000000..8a8569c73edae
--- /dev/null
+++ b/paddle/pten/api/lib/utils/allocator.h
@@ -0,0 +1,47 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/fluid/memory/allocation/allocator.h"
+#include "paddle/fluid/memory/malloc.h"
+#include "paddle/pten/core/allocator.h"
+#include "paddle/pten/core/storage.h"
+
+namespace paddle {
+namespace experimental {
+
+class DefaultAllocator : public pten::Allocator {
+ public:
+  using Allocation = pten::Allocation;
+  explicit DefaultAllocator(const paddle::platform::Place& place)
+      : place_(place) {}
+
+  static void Delete(void* data) {
+    deleter_(static_cast<paddle::memory::Allocation*>(data));
+  }
+
+  Allocation Allocate(size_t bytes_size) override {
+    paddle::memory::AllocationPtr a = memory::Alloc(place_, bytes_size);
+    void* ptr = a->ptr();
+    return Allocation(ptr, a.release(), &Delete, place_);
+  }
+
+ private:
+  paddle::platform::Place place_;
+  static paddle::memory::Allocator::AllocationDeleter deleter_;
+};
+
+}  // namespace experimental
+}  // namespace paddle
diff --git a/paddle/pten/api/lib/utils/storage.cc b/paddle/pten/api/lib/utils/storage.cc
new file mode 100644
index 0000000000000..ba26e7f600d60
--- /dev/null
+++ b/paddle/pten/api/lib/utils/storage.cc
@@ -0,0 +1,39 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/pten/api/lib/utils/storage.h"
+
+namespace paddle {
+namespace experimental {
+
+ExternalStorage::ExternalStorage(void* ptr,
+                                 size_t size,
+                                 const paddle::platform::Place& place)
+    : pten::Storage(pten::Allocation(ptr, place)), size_(size) {}
+
+ExternalStorage::ExternalStorage(const pten::intrusive_ptr<pten::Storage>& root,
+                                 size_t delta,
+                                 size_t size)
+    : Storage(pten::Allocation(static_cast<uint8_t*>(root->data()) + delta,
+                               root->place())),
+      size_(size) {
+  PADDLE_ENFORCE_LE(static_cast<size_t>(delta + size),
+                    root->size(),
+                    paddle::platform::errors::InvalidArgument(
+                        "The size of the external storage does "
+                        "not meet the metadata requirements."));
+}
+
+}  // namespace experimental
+}  // namespace paddle
diff --git a/paddle/pten/api/lib/utils/storage.h b/paddle/pten/api/lib/utils/storage.h
new file mode 100644
index 0000000000000..242ea6476ae98
--- /dev/null
+++ b/paddle/pten/api/lib/utils/storage.h
@@ -0,0 +1,113 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/fluid/memory/malloc.h"
+#include "paddle/pten/core/storage.h"
+
+namespace paddle {
+namespace experimental {
+
+class ExternalStorage : public pten::Storage {
+ public:
+  ExternalStorage(void* ptr, size_t size, const paddle::platform::Place& place);
+  ExternalStorage(const pten::intrusive_ptr<pten::Storage>& root,
+                  size_t delta,
+                  size_t size);
+
+  static const char* name() { return "ExternalStorage"; }
+
+  void Realloc(size_t n) override {
+    PADDLE_THROW(paddle::platform::errors::Unavailable(
+        "The external shared storage cannot be reallocated."));
+  }
+
+  size_t size() const noexcept override { return size_; }
+  const paddle::platform::Place& place() const override {
+    return data_.place();
+  }
+  bool OwnsMemory() const noexcept override { return false; }
+
+ private:
+  const int64_t size_{0};
+};
+
+class SharedStorage : public pten::Storage {
+ public:
+  explicit SharedStorage(
+      const std::shared_ptr<paddle::memory::Allocation>& allocation,
+      size_t offset)
+      : allocation_(allocation) {
+    CHECK(allocation);
+    data_ = pten::Allocation(
+        reinterpret_cast<void*>(reinterpret_cast<uintptr_t>(allocation->ptr()) +
+                                offset),
+        allocation->place());
+    size_ = allocation->size();
+  }
+
+  static const char* name() { return "SharedStorage"; }
+
+  void Realloc(size_t n) override {
+    PADDLE_THROW(paddle::platform::errors::Unavailable(
+        "The external shared storage cannot be reallocated."));
+  }
+
+  size_t size() const noexcept override { return size_; }
+  const paddle::platform::Place& place() const override {
+    return data_.place();
+  }
+  bool OwnsMemory() const noexcept override { return false; }
+
+  const std::shared_ptr<paddle::memory::Allocation>& GetAllocation() {
+    return allocation_;
+  }
+
+  // Temporary method: For compatible with fluid Tensor and improve performance
+  void ResetAllocation(std::shared_ptr<paddle::memory::Allocation> allocation,
+                       size_t offset) {
+    allocation_ = allocation;
+    data_ = pten::Allocation(
+        reinterpret_cast<void*>(reinterpret_cast<uintptr_t>(allocation->ptr()) +
+                                offset),
+        allocation->place());
+    size_ = allocation->size();
+  }
+
+  // Temporary method: For compatible with fluid Tensor and improve performance
+  void Reset() {
+    allocation_.reset();
+    data_.Clear();
+    size_ = 0;
+  }
+
+ private:
+  int64_t size_{0};
+  std::shared_ptr<paddle::memory::Allocation> allocation_;
+};
+
+class TensorStorage : public paddle::memory::allocation::Allocation {
+ public:
+  explicit TensorStorage(pten::intrusive_ptr<pten::Storage> storage)
+      : paddle::memory::allocation::Allocation(
+            storage->data(), storage->size(), storage->place()),
+        storage_(std::move(storage)) {}
+
+ private:
+  pten::intrusive_ptr<pten::Storage> storage_;
+};
+
+}  // namespace experimental
+}  // namespace paddle
diff --git a/paddle/pten/api/lib/utils/tensor_utils.cc b/paddle/pten/api/lib/utils/tensor_utils.cc
new file mode 100644
index 0000000000000..52554bf7af0ca
--- /dev/null
+++ b/paddle/pten/api/lib/utils/tensor_utils.cc
@@ -0,0 +1,230 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/pten/api/lib/utils/tensor_utils.h"
+
+#include <vector>
+
+#include "paddle/pten/core/compat_utils.h"
+
+namespace paddle {
+namespace experimental {
+
+template <typename DstLoD, typename SrcLoD>
+void SetLoD(DstLoD* dst, const SrcLoD& src) {
+  dst->reserve(src.size());
+  dst->clear();
+  for (auto&& v : src) {
+    dst->emplace_back(v);
+  }
+}
+
+std::unique_ptr<pten::DenseTensor> MakePtenDenseTensor(
+    const paddle::framework::Tensor& src) {
+  pten::DenseTensorMeta meta{pten::TransToPtenDataType(src.type()),
+                             src.dims(),
+                             pten::TransToPtenDataLayout(src.layout())};
+  auto shared_storage =
+      pten::make_intrusive<SharedStorage>(src.Holder(), src.offset());
+  return std::make_unique<pten::DenseTensor>(std::move(shared_storage),
+                                             std::move(meta));
+}
+
+std::unique_ptr<pten::DenseTensor> MakePtenDenseTensor(
+    const paddle::framework::LoDTensor& src) {
+  pten::DenseTensorMeta meta{pten::TransToPtenDataType(src.type()),
+                             src.dims(),
+                             pten::TransToPtenDataLayout(src.layout())};
+  SetLoD(&meta.lod, src.lod());
+  auto shared_storage =
+      pten::make_intrusive<SharedStorage>(src.Holder(), src.offset());
+
+  return std::make_unique<pten::DenseTensor>(std::move(shared_storage),
+                                             std::move(meta));
+}
+
+std::unique_ptr<pten::TensorBase> MakePtenTensorBaseFromVar(
+    const framework::Variable& variable, const pten::TensorArgDef& arg_def) {
+  auto expected_place = pten::TransToFluidPlace(arg_def.backend);
+
+  if (variable.IsType<framework::LoDTensor>()) {
+    const auto& tensor = variable.Get<framework::LoDTensor>();
+    if (!platform::is_same_place(tensor.place(), expected_place)) {
+      framework::LoDTensor tmp_tensor;
+      framework::TensorCopySync(tensor, expected_place, &tmp_tensor);
+      return MakePtenDenseTensor(tmp_tensor);
+    } else {
+      return MakePtenDenseTensor(tensor);
+    }
+  } else if (variable.IsType<framework::SelectedRows>()) {
+    // TODO(chenweihang): now we don't deal with row and height
+    // by xiaowei's advice
+    const auto& tensor = variable.Get<framework::SelectedRows>();
+    if (!platform::is_same_place(tensor.value().place(), expected_place)) {
+      framework::Tensor tmp_tensor;
+      TensorCopySync(tensor.value(), expected_place, &tmp_tensor);
+      // TODO(chenweihang): adapt SelectedRows by xiaowei's design
+      return MakePtenDenseTensor(tmp_tensor);
+    } else {
+      return MakePtenDenseTensor(tensor.value());
+    }
+  } else {
+    PADDLE_THROW(platform::errors::Unimplemented(
+        "Unsupported shared input `%s` type now when call pt kernel.",
+        framework::ToTypeName(variable.Type())));
+  }
+  return {};
+}
+
+std::unique_ptr<pten::TensorBase> MakePtenTensorBaseFromVar(
+    framework::Variable* variable, const pten::TensorArgDef& arg_def) {
+  // mutable_data before run kernel, to avoid share output form
+  // KernelContext to original tensor
+  if (variable->template IsType<framework::LoDTensor>()) {
+    auto* tensor = variable->template GetMutable<framework::LoDTensor>();
+    tensor->mutable_data(pten::TransToFluidPlace(arg_def.backend),
+                         pten::TransToProtoVarType(arg_def.dtype));
+    return MakePtenDenseTensor(*tensor);
+  } else if (variable->template IsType<framework::SelectedRows>()) {
+    auto* tensor = variable->template GetMutable<framework::SelectedRows>();
+    tensor->mutable_value()->mutable_data(
+        pten::TransToFluidPlace(arg_def.backend),
+        pten::TransToProtoVarType(arg_def.dtype));
+    // TODO(chenweihang): adapt SelectedRows by xiaowei's design,
+    // here the row and height will lost in output!
+    return MakePtenDenseTensor(tensor->value());
+  } else {
+    PADDLE_THROW(platform::errors::Unimplemented(
+        "Unsupported shared output `%s` type now when call pt kernel.",
+        framework::ToTypeName(variable->Type())));
+  }
+  return {};
+}
+
+void MovesStorage(pten::DenseTensor* src, paddle::framework::Tensor* dst) {
+  CHECK(src);
+  CHECK(dst);
+  dst->Resize(src->dims());
+  auto storage = src->release();
+  CHECK(storage->OwnsMemory());
+  std::shared_ptr<paddle::memory::allocation::Allocation> holder(
+      new TensorStorage(std::move(storage)));
+  dst->ResetHolderWithType(holder, pten::TransToProtoVarType(src->data_type()));
+}
+
+void MovesStorage(pten::DenseTensor* src, paddle::framework::LoDTensor* dst) {
+  CHECK(src);
+  CHECK(dst);
+  SetLoD(dst->mutable_lod(), src->lod());
+  MovesStorage(src, static_cast<paddle::framework::Tensor*>(dst));
+}
+
+void ReMakePtenDenseTensor(const paddle::framework::Tensor& src,
+                           pten::DenseTensor* dst) {
+  auto* meta = pten::CompatibleDenseTensorUtils::GetMutableMeta(dst);
+  meta->dims = src.dims();
+  // Since the type of DenseTensorMeta is const, const_cast must be used
+  const_cast<DataType&>(meta->type) = pten::TransToPtenDataType(src.type());
+  // Since the type of DenseTensorMeta is const, const_cast must be used
+  const_cast<DataLayout&>(meta->layout) =
+      pten::TransToPtenDataLayout(src.layout());
+  auto* shared_storage = static_cast<SharedStorage*>(
+      pten::CompatibleDenseTensorUtils::UnsafeGetMutableStorage(dst));
+  PADDLE_ENFORCE_NOT_NULL(
+      shared_storage,
+      platform::errors::NotFound(
+          "Target DenseTensor's shared storage is nullptr."));
+  shared_storage->ResetAllocation(src.Holder(), src.offset());
+}
+
+void ReMakePtenDenseTensor(const paddle::framework::LoDTensor& src,
+                           pten::DenseTensor* dst) {
+  auto* meta = pten::CompatibleDenseTensorUtils::GetMutableMeta(dst);
+  meta->dims = src.dims();
+  // Since the type of DenseTensorMeta is const, const_cast must be used
+  const_cast<DataType&>(meta->type) = pten::TransToPtenDataType(src.type());
+  // Since the type of DenseTensorMeta is const, const_cast must be used
+  const_cast<DataLayout&>(meta->layout) =
+      pten::TransToPtenDataLayout(src.layout());
+  SetLoD(&(meta->lod), src.lod());
+  auto* shared_storage = static_cast<SharedStorage*>(
+      pten::CompatibleDenseTensorUtils::UnsafeGetMutableStorage(dst));
+  PADDLE_ENFORCE_NOT_NULL(
+      shared_storage,
+      platform::errors::NotFound(
+          "Target DenseTensor's shared storage is nullptr."));
+  shared_storage->ResetAllocation(src.Holder(), src.offset());
+}
+
+void ReMakePtenDenseTensorFromVar(const framework::Variable& variable,
+                                  const pten::TensorArgDef& arg_def,
+                                  pten::DenseTensor* dst) {
+  auto expected_place = pten::TransToFluidPlace(arg_def.backend);
+
+  if (variable.IsType<framework::LoDTensor>()) {
+    const auto& tensor = variable.Get<framework::LoDTensor>();
+    if (!platform::is_same_place(tensor.place(), expected_place)) {
+      framework::LoDTensor tmp_tensor;
+      framework::TensorCopySync(tensor, expected_place, &tmp_tensor);
+      ReMakePtenDenseTensor(tmp_tensor, dst);
+    } else {
+      ReMakePtenDenseTensor(tensor, dst);
+    }
+  } else if (variable.IsType<framework::SelectedRows>()) {
+    // TODO(chenweihang): now we don't deal with row and height
+    // by xiaowei's advice
+    const auto& tensor = variable.Get<framework::SelectedRows>();
+    if (!platform::is_same_place(tensor.value().place(), expected_place)) {
+      framework::Tensor tmp_tensor;
+      TensorCopySync(tensor.value(), expected_place, &tmp_tensor);
+      // TODO(chenweihang): adapt SelectedRows by xiaowei's design
+      ReMakePtenDenseTensor(tmp_tensor, dst);
+    } else {
+      ReMakePtenDenseTensor(tensor.value(), dst);
+    }
+  } else {
+    PADDLE_THROW(platform::errors::Unimplemented(
+        "Unsupported shared input `%s` type now when call pt kernel.",
+        framework::ToTypeName(variable.Type())));
+  }
+}
+
+void ReMakePtenDenseTensorFromVar(framework::Variable* variable,
+                                  const pten::TensorArgDef& arg_def,
+                                  pten::DenseTensor* dst) {
+  // mutable_data before run kernel, to avoid share output form
+  // KernelContext to original tensor
+  if (variable->template IsType<framework::LoDTensor>()) {
+    auto* tensor = variable->template GetMutable<framework::LoDTensor>();
+    // TODO(chenweihang): use original var type if arg_def.dtype is UNDEFINED
+    tensor->mutable_data(pten::TransToFluidPlace(arg_def.backend),
+                         pten::TransToProtoVarType(arg_def.dtype));
+    ReMakePtenDenseTensor(*tensor, dst);
+  } else if (variable->template IsType<framework::SelectedRows>()) {
+    auto* tensor = variable->template GetMutable<framework::SelectedRows>();
+    tensor->mutable_value()->mutable_data(
+        pten::TransToFluidPlace(arg_def.backend),
+        pten::TransToProtoVarType(arg_def.dtype));
+    // TODO(chenweihang): adapt SelectedRows by xiaowei's design,
+    // here the row and height will lost in output!
+    ReMakePtenDenseTensor(tensor->value(), dst);
+  } else {
+    PADDLE_THROW(platform::errors::Unimplemented(
+        "Unsupported shared output `%s` type now when call pt kernel.",
+        framework::ToTypeName(variable->Type())));
+  }
+}
+
+}  // namespace experimental
+}  // namespace paddle
diff --git a/paddle/pten/api/lib/utils/tensor_utils.h b/paddle/pten/api/lib/utils/tensor_utils.h
new file mode 100644
index 0000000000000..c1840d97fd2e3
--- /dev/null
+++ b/paddle/pten/api/lib/utils/tensor_utils.h
@@ -0,0 +1,72 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <memory>
+
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/variable.h"
+
+#include "paddle/pten/api/lib/utils/allocator.h"
+#include "paddle/pten/api/lib/utils/storage.h"
+#include "paddle/pten/core/convert_utils.h"
+#include "paddle/pten/core/dense_tensor.h"
+#include "paddle/pten/core/kernel_factory.h"
+
+namespace paddle {
+namespace experimental {
+
+std::unique_ptr<pten::DenseTensor> MakePtenDenseTensor(
+    const paddle::framework::Tensor& src);
+
+std::unique_ptr<pten::DenseTensor> MakePtenDenseTensor(
+    const paddle::framework::LoDTensor& src);
+
+std::unique_ptr<pten::TensorBase> MakePtenTensorBaseFromVar(
+    const framework::Variable& variable, const pten::TensorArgDef& arg_def);
+
+std::unique_ptr<pten::TensorBase> MakePtenTensorBaseFromVar(
+    framework::Variable* variable, const pten::TensorArgDef& arg_def);
+
+void MovesStorage(pten::DenseTensor* src, paddle::framework::Tensor* dst);
+
+void MovesStorage(pten::DenseTensor* src, paddle::framework::LoDTensor* dst);
+
+/**
+ * In order to improve the compatibility state performance, some tricky tool
+ * functions are added.
+ *
+ * The ReMake** function takes out the LoDTensor information and directly
+ * replaces it with the corresponding member of the DenseTensor to avoid
+ * the overhead caused by frequent construction and destruction of the
+ * DenseTensor.
+ */
+
+void ReMakePtenDenseTensor(const paddle::framework::Tensor& src,
+                           pten::DenseTensor* dst);
+
+void ReMakePtenDenseTensor(const paddle::framework::LoDTensor& src,
+                           pten::DenseTensor* dst);
+
+void ReMakePtenDenseTensorFromVar(const framework::Variable& variable,
+                                  const pten::TensorArgDef& arg_def,
+                                  pten::DenseTensor* dst);
+
+void ReMakePtenDenseTensorFromVar(framework::Variable* variable,
+                                  const pten::TensorArgDef& arg_def,
+                                  pten::DenseTensor* dst);
+
+}  // namespace experimental
+}  // namespace paddle
diff --git a/paddle/pten/common/backend.h b/paddle/pten/common/backend.h
new file mode 100644
index 0000000000000..e0bf746050a67
--- /dev/null
+++ b/paddle/pten/common/backend.h
@@ -0,0 +1,94 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <ostream>
+
+#include "paddle/fluid/platform/enforce.h"
+
+namespace paddle {
+namespace experimental {
+
+/**
+ * [ Why need Backend? ]
+ *
+ * Backend not only means place. Backend is a superset of place.
+ *
+ * Place cannot indicate the difference in calculation methods on the device,
+ * but in order to make the boundary of the kernel clearer and the function
+ * more specific, we need to distinguish the calculation method.
+ *
+ * Such as the kernel for CPU device, it can be a native CPU kernel,
+ * or a kernel implemented by MKLDNN library.
+ *
+ * Note(chenweihang): HIP is not needed now, we can added it if needed
+ * in the future
+ */
+enum class Backend : uint8_t {
+  // kernel backend cannot be undefined
+  UNDEFINED = 0,
+
+  // basic kernel backend
+  CPU,
+
+  // various acceleration devices' backends
+  CUDA,
+  XPU,  // XPU currently does not exist at the same time as CUDA
+  NPU,  // NPU currently does not exist at the same time as CUDA
+
+  // the third library backend
+  MKLDNN,
+  CUDNN,
+
+  // end of backend types
+  NUM_BACKENDS,
+};
+
+inline std::ostream& operator<<(std::ostream& os, Backend backend) {
+  switch (backend) {
+    case Backend::UNDEFINED:
+      os << "Undefined";
+      break;
+    case Backend::CPU:
+      os << "CPU";
+      break;
+    case Backend::CUDA:
+      os << "CUDA";
+      break;
+    case Backend::XPU:
+      os << "XPU";
+      break;
+    case Backend::NPU:
+      os << "NPU";
+      break;
+    case Backend::MKLDNN:
+      os << "MKLDNN";
+      break;
+    case Backend::CUDNN:
+      os << "CUDNN";
+      break;
+    default:
+      PADDLE_THROW(platform::errors::InvalidArgument(
+          "Invalid enum backend type `%d`.", static_cast<int>(backend)));
+  }
+  return os;
+}
+
+}  // namespace experimental
+}  // namespace paddle
+
+namespace pten {
+using Backend = paddle::experimental::Backend;
+}
diff --git a/paddle/pten/common/data_type.h b/paddle/pten/common/data_type.h
new file mode 100644
index 0000000000000..27ca28b273485
--- /dev/null
+++ b/paddle/pten/common/data_type.h
@@ -0,0 +1,187 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+// See Note [ Why still include the fluid headers? ]
+#include "paddle/fluid/platform/bfloat16.h"
+#include "paddle/fluid/platform/complex.h"
+#include "paddle/fluid/platform/enforce.h"
+#include "paddle/fluid/platform/float16.h"
+
+namespace paddle {
+namespace experimental {
+
+using complex64 = ::paddle::platform::complex<float>;
+using complex128 = ::paddle::platform::complex<double>;
+using float16 = ::paddle::platform::float16;
+using bfloat16 = ::paddle::platform::bfloat16;
+
+enum class DataType {
+  UNDEFINED = 0,
+  BOOL,
+  INT8,   // Char
+  UINT8,  // BYte
+  INT16,
+  INT32,
+  UINT32,
+  INT64,
+  UINT64,
+  BFLOAT16,
+  FLOAT16,
+  UINT16,
+  FLOAT32,
+  FLOAT64,
+  COMPLEX64,
+  COMPLEX128,
+  NUM_DATA_TYPES
+};
+
+inline size_t SizeOf(DataType data_type) {
+  switch (data_type) {
+    case DataType::BOOL:
+    case DataType::UINT8:
+    case DataType::INT8:
+      return 1;
+    case DataType::BFLOAT16:
+    case DataType::FLOAT16:
+    case DataType::INT16:
+    case DataType::UINT16:
+      return 2;
+    case DataType::FLOAT32:
+    case DataType::INT32:
+    case DataType::UINT32:
+      return 4;
+    case DataType::FLOAT64:
+    case DataType::INT64:
+    case DataType::UINT64:
+    case DataType::COMPLEX64:
+      return 8;
+    case DataType::COMPLEX128:
+      return 16;
+    case DataType::UNDEFINED:
+    case DataType::NUM_DATA_TYPES:
+      PADDLE_THROW(platform::errors::Unimplemented(
+          "Data type %d is not supported by tensor.",
+          static_cast<int>(data_type)));
+  }
+  return 0;
+}
+
+#define PT_FOR_EACH_DATA_TYPE(_)    \
+  _(bool, DataType::BOOL)           \
+  _(int8_t, DataType::INT8)         \
+  _(uint8_t, DataType::UINT8)       \
+  _(int16_t, DataType::INT16)       \
+  _(uint16_t, DataType::UINT16)     \
+  _(int32_t, DataType::INT32)       \
+  _(uint32_t, DataType::UINT32)     \
+  _(int64_t, DataType::INT64)       \
+  _(uint64_t, DataType::UINT64)     \
+  _(bfloat16, DataType::BFLOAT16)   \
+  _(float16, DataType::FLOAT16)     \
+  _(float, DataType::FLOAT32)       \
+  _(double, DataType::FLOAT64)      \
+  _(complex64, DataType::COMPLEX64) \
+  _(complex128, DataType::COMPLEX128)
+
+template <DataType T>
+struct DataTypeToCppType;
+
+template <typename T>
+struct CppTypeToDataType;
+
+#define PT_SPECIALIZE_DataTypeToCppType(cpp_type, data_type) \
+  template <>                                                \
+  struct DataTypeToCppType<data_type> {                      \
+    using type = cpp_type;                                   \
+  };
+
+PT_FOR_EACH_DATA_TYPE(PT_SPECIALIZE_DataTypeToCppType)
+
+#undef PT_SPECIALIZE_DataTypeToCppType
+
+#define PT_SPECIALIZE_CppTypeToDataType(cpp_type, data_type) \
+  template <>                                                \
+  struct CppTypeToDataType<cpp_type> {                       \
+    constexpr static DataType Type() { return data_type; }   \
+  };
+
+PT_FOR_EACH_DATA_TYPE(PT_SPECIALIZE_CppTypeToDataType)
+
+#undef PT_SPECIALIZE_CppTypeToDataType
+
+inline std::ostream& operator<<(std::ostream& os, DataType dtype) {
+  switch (dtype) {
+    case DataType::UNDEFINED:
+      os << "Undefined";
+      break;
+    case DataType::BOOL:
+      os << "bool";
+      break;
+    case DataType::INT8:
+      os << "int8";
+      break;
+    case DataType::UINT8:
+      os << "uint8";
+      break;
+    case DataType::INT16:
+      os << "int16";
+      break;
+    case DataType::UINT16:
+      os << "uint16";
+      break;
+    case DataType::INT32:
+      os << "int32";
+      break;
+    case DataType::UINT32:
+      os << "uint32";
+      break;
+    case DataType::INT64:
+      os << "int64";
+      break;
+    case DataType::UINT64:
+      os << "uint64";
+      break;
+    case DataType::BFLOAT16:
+      os << "bfloat16";
+      break;
+    case DataType::FLOAT16:
+      os << "float16";
+      break;
+    case DataType::FLOAT32:
+      os << "float32";
+      break;
+    case DataType::FLOAT64:
+      os << "float64";
+      break;
+    case DataType::COMPLEX64:
+      os << "complex64";
+      break;
+    case DataType::COMPLEX128:
+      os << "complex128";
+      break;
+    default:
+      PADDLE_THROW(platform::errors::InvalidArgument(
+          "Invalid enum data type `%d`.", static_cast<int>(dtype)));
+  }
+  return os;
+}
+
+}  // namespace experimental
+}  // namespace paddle
+
+namespace pten {
+using DataType = paddle::experimental::DataType;
+}
diff --git a/paddle/pten/common/layout.h b/paddle/pten/common/layout.h
new file mode 100644
index 0000000000000..0da10dff4335b
--- /dev/null
+++ b/paddle/pten/common/layout.h
@@ -0,0 +1,60 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/fluid/platform/enforce.h"
+
+namespace paddle {
+namespace experimental {
+
+enum class DataLayout {
+  UNDEFINED = 0,
+  ANY,
+  NHWC,
+  NCHW,
+  MKLDNN,
+  NUM_DATA_LAYOUTS,
+};
+
+inline std::ostream& operator<<(std::ostream& os, DataLayout layout) {
+  switch (layout) {
+    case DataLayout::UNDEFINED:
+      os << "Undefined";
+      break;
+    case DataLayout::ANY:
+      os << "Any";
+      break;
+    case DataLayout::NHWC:
+      os << "NHWC";
+      break;
+    case DataLayout::NCHW:
+      os << "NCHW";
+      break;
+    case DataLayout::MKLDNN:
+      os << "MKLDNN";
+      break;
+    default:
+      PADDLE_THROW(platform::errors::InvalidArgument(
+          "Invalid enum data layout type `%d`.", static_cast<int>(layout)));
+  }
+  return os;
+}
+
+}  // namespace experimental
+}  // namespace paddle
+
+namespace pten {
+using DataLayout = paddle::experimental::DataLayout;
+}
diff --git a/paddle/pten/common/scalar.h b/paddle/pten/common/scalar.h
new file mode 100644
index 0000000000000..ef648ba70f336
--- /dev/null
+++ b/paddle/pten/common/scalar.h
@@ -0,0 +1,86 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <cstdint>
+
+#include "paddle/fluid/platform/enforce.h"
+
+namespace paddle {
+namespace experimental {
+
+class Scalar {
+ public:
+  // Constructor support implicit
+  Scalar(float val) : tag(Tag::HAS_F) { data_.f = val; }  // NOLINT
+
+  Scalar(double val) : tag(Tag::HAS_D) { data_.d = val; }  // NOLINT
+
+  Scalar(int32_t val) : tag(Tag::HAS_I32) { data_.i32 = val; }  // NOLINT
+
+  Scalar(int64_t val) : tag(Tag::HAS_I64) { data_.i64 = val; }  // NOLINT
+
+  Scalar(bool val) : tag(Tag::HAS_B) { data_.b = val; }  // NOLINT
+
+  Scalar(const std::string& str_value) : tag(Tag::HAS_D) {  // NOLINT
+    if (str_value == "inf") {
+      data_.d = std::numeric_limits<double>::infinity();
+    } else if (str_value == "-inf") {
+      data_.d = -std::numeric_limits<double>::infinity();
+    } else if (str_value == "nan") {
+      data_.d = std::numeric_limits<double>::quiet_NaN();
+    } else {
+      data_.d = std::stod(str_value);
+    }
+  }
+
+  template <typename T>
+  inline T to() const {
+    switch (tag) {
+      case Tag::HAS_F:
+        return static_cast<T>(data_.f);
+      case Tag::HAS_D:
+        return static_cast<T>(data_.d);
+      case Tag::HAS_I32:
+        return static_cast<T>(data_.i32);
+      case Tag::HAS_I64:
+        return static_cast<T>(data_.i64);
+      case Tag::HAS_B:
+        return static_cast<T>(data_.b);
+      default:
+        PADDLE_THROW(platform::errors::InvalidArgument(
+            "Invalid enum scalar type tag `%d`.", static_cast<int>(tag)));
+    }
+  }
+
+ private:
+  enum class Tag { HAS_F, HAS_D, HAS_I32, HAS_I64, HAS_B };
+  Tag tag;
+
+  union data {
+    float f;
+    double d;
+    int32_t i32;
+    int64_t i64;
+    bool b;
+  } data_;
+};
+
+}  // namespace experimental
+}  // namespace paddle
+
+namespace pten {
+using Scalar = paddle::experimental::Scalar;
+}
diff --git a/paddle/pten/core/CMakeLists.txt b/paddle/pten/core/CMakeLists.txt
new file mode 100644
index 0000000000000..a7ccf31467438
--- /dev/null
+++ b/paddle/pten/core/CMakeLists.txt
@@ -0,0 +1,19 @@
+IF(WITH_MKLDNN)
+    set(MKLDNN_CTX_DEPS mkldnn)
+ELSE()
+    set(MKLDNN_CTX_DEPS)
+ENDIF()
+
+if(WITH_GPU)
+  cc_library(convert_utils SRCS convert_utils.cc DEPS data_type place gpu_info)
+elseif(WITH_ROCM)
+  cc_library(convert_utils SRCS convert_utils.cc DEPS data_type place gpu_info)
+else()
+  cc_library(convert_utils SRCS convert_utils.cc DEPS data_type place)
+endif()
+
+cc_library(kernel_factory SRCS kernel_factory.cc DEPS enforce)
+cc_library(kernel_context SRCS kernel_context.cc DEPS enforce device_context)
+
+cc_library(tensor_base SRCS tensor_base.cc allocator.cc storage.cc DEPS enforce)
+cc_library(dense_tensor SRCS dense_tensor.cc DEPS tensor_base)
diff --git a/paddle/pten/core/allocator.cc b/paddle/pten/core/allocator.cc
new file mode 100644
index 0000000000000..bcf03ee5acf0a
--- /dev/null
+++ b/paddle/pten/core/allocator.cc
@@ -0,0 +1,17 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/pten/core/allocator.h"
+
+namespace pten {}  // namespace pten
diff --git a/paddle/pten/core/allocator.h b/paddle/pten/core/allocator.h
new file mode 100644
index 0000000000000..c16c4ffaa6a37
--- /dev/null
+++ b/paddle/pten/core/allocator.h
@@ -0,0 +1,159 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <cstdint>
+#include "paddle/fluid/platform/place.h"
+
+namespace pten {
+
+/// \brief Encapsulates strategies for access/addressing, allocation/
+/// deallocation and construction/destruction of objects.
+class RawAllocator {
+ public:
+  using Place = paddle::platform::Place;
+
+  /// \brief Default destructor.
+  virtual ~RawAllocator() = default;
+
+  /// \brief Allocates storage suitable for an array object of n bytes
+  /// and creates the array, but does not construct array elements.
+  /// May throw exceptions.
+  /// \param bytes_size The number of bytes to allocate.
+  /// \return The first address allocated.
+  virtual void* Allocate(size_t bytes_size) = 0;
+
+  /// \brief Deallocates storage pointed to ptr, which must be a value
+  /// returned by a previous call to allocate that has not been
+  /// invalidated by an intervening call to deallocate. The bytes_size
+  /// must match the value previously passed to allocate.
+  /// \param ptr The first address to deallocate.
+  /// \param bytes_size The number of bytes to deallocate.
+  virtual void Deallocate(void* ptr, size_t bytes_size) = 0;
+
+  /// \brief Get the place value of the allocator and the allocation.
+  /// \return The place value of the allocator and the allocation.
+  virtual const Place& place() const = 0;
+};
+
+/// \brief Fancy pointer with context. The use of this data type
+/// is to be compatible with allocators from different frameworks
+/// without significant performance loss. This class does not
+/// support being inherited.
+class Allocation final {
+ public:
+  using Place = paddle::platform::Place;
+  using DeleterFnPtr = void (*)(void*);
+
+  Allocation() = default;
+  Allocation(Allocation&&) = default;
+  Allocation& operator=(Allocation&&) = default;
+
+  Allocation(void* data, const Place& place) : data_(data), place_(place) {}
+
+  Allocation(void* data,
+             void* ctx,
+             DeleterFnPtr ctx_deleter,
+             const Place& place)
+      : data_(data), ctx_(ctx, ctx_deleter), place_(place) {}
+
+  void* operator->() const noexcept { return data_; }
+  operator bool() const noexcept { return data_ || ctx_.Get(); }
+  const Place& place() const noexcept { return place_; }
+
+  void Clear() noexcept {
+    data_ = nullptr;
+    ctx_.Clear();
+  }
+
+  /// \brief Statically cast the void pointer of the context object to
+  /// the primitive type. Conversion of any pointer to void* and back
+  /// to pointer to the original cv type preserves its original value.
+  /// \param T The primitive type name of the context pointer.
+  /// \param expected_deleter The destructor passed in to enhance type
+  /// safety checking.
+  template <typename T>
+  T* CastContext(DeleterFnPtr expected_deleter) const noexcept {
+    if (ctx_.deleter() != expected_deleter) {
+      return nullptr;
+    }
+    return static_cast<T*>(ctx_.Get());
+  }
+
+ public:
+  class Context {
+   public:
+    Context() = default;
+    Context(void* ctx, DeleterFnPtr deleter) noexcept : ctx_(ctx),
+                                                        deleter_(deleter) {}
+    Context(Context&& other) noexcept {
+      // Exchange them explicitly to avoid moving is equivalent
+      // to copying.
+      swap(*this, other);
+    }
+    Context& operator=(Context&& other) noexcept {
+      swap(*this, other);
+      return *this;
+    }
+    ~Context() {
+      if (deleter_) {
+        deleter_(ctx_);
+      }
+    }
+    void Clear() noexcept {
+      ctx_ = nullptr;
+      deleter_ = nullptr;
+    }
+    void* Get() const noexcept { return ctx_; }
+    DeleterFnPtr deleter() const noexcept { return deleter_; }
+    void* Release() noexcept {
+      deleter_ = nullptr;
+      return ctx_;
+    }
+    friend void swap(Context& a, Context& b) noexcept;
+
+   private:
+    void* ctx_{nullptr};
+    DeleterFnPtr deleter_{nullptr};
+  };
+
+ private:
+  void* data_{nullptr};
+  Context ctx_;
+  // TODO(Shixiaowei02): Enum needs to be used instead to reduce
+  // the construction overhead by more than 50%.
+  Place place_;
+};
+
+inline void swap(Allocation::Context& a, Allocation::Context& b) noexcept {
+  ::std::swap(a.ctx_, b.ctx_);
+  ::std::swap(a.deleter_, b.deleter_);
+}
+
+/// \brief Context compatible allocator interface. This allocator is
+/// mainly used for general data structures such as Tensor. The raw
+/// allocator is more universal and efficient.
+class Allocator {
+ public:
+  virtual ~Allocator() = default;
+  virtual Allocation Allocate(size_t bytes_size) = 0;
+};
+
+inline Allocation Allocate(const std::shared_ptr<Allocator>& a, size_t n) {
+  CHECK(a);
+  return a->Allocate(n);
+}
+
+}  // namespace pten
diff --git a/paddle/pten/core/compat_utils.h b/paddle/pten/core/compat_utils.h
new file mode 100644
index 0000000000000..289c311bf3eba
--- /dev/null
+++ b/paddle/pten/core/compat_utils.h
@@ -0,0 +1,50 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/pten/api/lib/utils/storage.h"
+#include "paddle/pten/core/dense_tensor.h"
+#include "paddle/pten/core/storage.h"
+#include "paddle/pten/core/tensor_meta.h"
+
+namespace pten {
+
+/**
+ * In order to meet some adaptation requirements of the compatible state,
+ * these class is added to provide some tool functions.
+ *
+ * These utility functions may be deleted in the future, It is not recommended
+ * to be widely used in the framework
+ */
+
+class CompatibleDenseTensorUtils {
+ public:
+  static Storage* UnsafeGetMutableStorage(DenseTensor* tensor) {
+    return tensor->storage_.get();
+  }
+
+  static DenseTensorMeta* GetMutableMeta(DenseTensor* tensor) {
+    return &(tensor->meta_);
+  }
+
+  // only can deal with SharedStorage now
+  static void ClearStorage(DenseTensor* tensor) {
+    // use static_cast to improve performance, replace by dynamic_cast later
+    static_cast<paddle::experimental::SharedStorage*>(tensor->storage_.get())
+        ->Reset();
+  }
+};
+
+}  // namespace pten
diff --git a/paddle/pten/core/convert_utils.cc b/paddle/pten/core/convert_utils.cc
new file mode 100644
index 0000000000000..32f2497dd18a5
--- /dev/null
+++ b/paddle/pten/core/convert_utils.cc
@@ -0,0 +1,163 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/pten/core/convert_utils.h"
+
+// See Note [ Why still include the fluid headers? ]
+#include "paddle/fluid/platform/gpu_info.h"
+
+namespace pten {
+
+// TODO(chenweihang): Add other place trans cases later
+Backend TransToPtenBackend(const paddle::platform::Place& place) {
+  if (paddle::platform::is_cpu_place(place)) {
+    return Backend::CPU;
+  } else if (paddle::platform::is_gpu_place(place)) {
+    return Backend::CUDA;
+  } else {
+    return Backend::UNDEFINED;
+  }
+}
+
+paddle::experimental::DataType TransToPtenDataType(
+    const paddle::framework::proto::VarType::Type& dtype) {
+  // Set the order of case branches according to the frequency with
+  // the data type is used
+  switch (dtype) {
+    case paddle::framework::proto::VarType::FP32:
+      return DataType::FLOAT32;
+    case paddle::framework::proto::VarType::FP64:
+      return DataType::FLOAT64;
+    case paddle::framework::proto::VarType::INT64:
+      return DataType::INT64;
+    case paddle::framework::proto::VarType::INT32:
+      return DataType::INT32;
+    case paddle::framework::proto::VarType::INT8:
+      return DataType::INT8;
+    case paddle::framework::proto::VarType::UINT8:
+      return DataType::UINT8;
+    case paddle::framework::proto::VarType::INT16:
+      return DataType::INT16;
+    case paddle::framework::proto::VarType::COMPLEX64:
+      return DataType::COMPLEX64;
+    case paddle::framework::proto::VarType::COMPLEX128:
+      return DataType::COMPLEX128;
+    case paddle::framework::proto::VarType::FP16:
+      return DataType::FLOAT16;
+    case paddle::framework::proto::VarType::BF16:
+      return DataType::BFLOAT16;
+    case paddle::framework::proto::VarType::BOOL:
+      return DataType::BOOL;
+    default:
+      return DataType::UNDEFINED;
+  }
+}
+
+DataLayout TransToPtenDataLayout(const paddle::framework::DataLayout& layout) {
+  switch (layout) {
+    case paddle::framework::DataLayout::kNHWC:
+      return DataLayout::NHWC;
+    case paddle::framework::DataLayout::kNCHW:
+      return DataLayout::NCHW;
+    case paddle::framework::DataLayout::kAnyLayout:
+      return DataLayout::ANY;
+    case paddle::framework::DataLayout::kMKLDNN:
+      return DataLayout::MKLDNN;
+    default:
+      return DataLayout::UNDEFINED;
+  }
+}
+
+paddle::platform::Place TransToFluidPlace(const Backend& backend) {
+  // TODO(chenweihang): add other trans cases later
+  switch (backend) {
+    case pten::Backend::CPU:
+      return paddle::platform::CPUPlace();
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+    case pten::Backend::CUDA:
+      return paddle::platform::CUDAPlace(
+          paddle::platform::GetCurrentDeviceId());
+#endif
+#ifdef PADDLE_WITH_MKLDNN
+    case pten::Backend::MKLDNN:
+      return paddle::platform::CPUPlace();
+#endif
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+    case pten::Backend::CUDNN:
+      return paddle::platform::CUDAPlace(
+          paddle::platform::GetCurrentDeviceId());
+#endif
+    default:
+      PADDLE_THROW(paddle::platform::errors::Unimplemented(
+          "Unsupported backend `%s` when casting it to paddle place type.",
+          backend));
+  }
+}
+
+paddle::framework::proto::VarType::Type TransToProtoVarType(
+    const paddle::experimental::DataType& dtype) {
+  // Set the order of case branches according to the frequency with
+  // the data type is used
+  switch (dtype) {
+    case DataType::FLOAT32:
+      return paddle::framework::proto::VarType::FP32;
+    case DataType::FLOAT64:
+      return paddle::framework::proto::VarType::FP64;
+    case DataType::INT64:
+      return paddle::framework::proto::VarType::INT64;
+    case DataType::INT32:
+      return paddle::framework::proto::VarType::INT32;
+    case DataType::INT8:
+      return paddle::framework::proto::VarType::INT8;
+    case DataType::UINT8:
+      return paddle::framework::proto::VarType::UINT8;
+    case DataType::INT16:
+      return paddle::framework::proto::VarType::INT16;
+    case DataType::COMPLEX64:
+      return paddle::framework::proto::VarType::COMPLEX64;
+    case DataType::COMPLEX128:
+      return paddle::framework::proto::VarType::COMPLEX128;
+    case DataType::FLOAT16:
+      return paddle::framework::proto::VarType::FP16;
+    case DataType::BFLOAT16:
+      return paddle::framework::proto::VarType::BF16;
+    case DataType::BOOL:
+      return paddle::framework::proto::VarType::BOOL;
+    default:
+      PADDLE_THROW(paddle::platform::errors::Unimplemented(
+          "Unsupported data type `%s` when casting it into "
+          "paddle data type.",
+          dtype));
+  }
+}
+
+paddle::framework::DataLayout TransToFluidDataLayout(const DataLayout& layout) {
+  switch (layout) {
+    case DataLayout::NHWC:
+      return paddle::framework::DataLayout::kNHWC;
+    case DataLayout::NCHW:
+      return paddle::framework::DataLayout::kNCHW;
+    case DataLayout::ANY:
+      return paddle::framework::DataLayout::kAnyLayout;
+    case DataLayout::MKLDNN:
+      return paddle::framework::DataLayout::kMKLDNN;
+    default:
+      PADDLE_THROW(paddle::platform::errors::Unimplemented(
+          "Unsupported data layout `%s` when casting it into "
+          "paddle data layout.",
+          layout));
+  }
+}
+
+}  // namespace pten
diff --git a/paddle/pten/core/convert_utils.h b/paddle/pten/core/convert_utils.h
new file mode 100644
index 0000000000000..aa79cb240dd04
--- /dev/null
+++ b/paddle/pten/core/convert_utils.h
@@ -0,0 +1,43 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/pten/common/backend.h"
+#include "paddle/pten/common/data_type.h"
+#include "paddle/pten/common/layout.h"
+
+// See Note [ Why still include the fluid headers? ]
+#include "paddle/fluid/framework/data_layout.h"
+#include "paddle/fluid/framework/data_type.h"
+#include "paddle/fluid/platform/place.h"
+
+// TODO(chenweihang): this file may need to be removed
+
+namespace pten {
+
+using DataType = paddle::experimental::DataType;
+using DataLayout = paddle::experimental::DataLayout;
+
+Backend TransToPtenBackend(const paddle::platform::Place& place);
+DataType TransToPtenDataType(
+    const paddle::framework::proto::VarType::Type& dtype);
+DataLayout TransToPtenDataLayout(const paddle::framework::DataLayout& layout);
+
+paddle::platform::Place TransToFluidPlace(const Backend& backend);
+paddle::framework::proto::VarType::Type TransToProtoVarType(
+    const DataType& dtype);
+paddle::framework::DataLayout TransToFluidDataLayout(const DataLayout& layout);
+
+}  // namespace pten
diff --git a/paddle/pten/core/dense_tensor.cc b/paddle/pten/core/dense_tensor.cc
new file mode 100644
index 0000000000000..647ddea0b4e1b
--- /dev/null
+++ b/paddle/pten/core/dense_tensor.cc
@@ -0,0 +1,138 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/pten/core/dense_tensor.h"
+
+namespace pten {
+
+DenseTensor::DenseTensor(const std::shared_ptr<Allocator>& a,
+                         const DenseTensorMeta& meta)
+    : meta_(meta),
+      storage_(
+          make_intrusive<TensorStorage>(a, SizeOf(data_type()) * numel())) {}
+
+DenseTensor::DenseTensor(const std::shared_ptr<Allocator>& a,
+                         DenseTensorMeta&& meta)
+    : meta_(std::move(meta)),
+      storage_(
+          make_intrusive<TensorStorage>(a, SizeOf(data_type()) * numel())) {}
+
+DenseTensor::DenseTensor(intrusive_ptr<Storage> storage,
+                         const DenseTensorMeta& meta)
+    : meta_(meta), storage_(std::move(storage)) {}
+
+DenseTensor::DenseTensor(intrusive_ptr<Storage> storage, DenseTensorMeta&& meta)
+    : meta_(std::move(meta)), storage_(std::move(storage)) {}
+
+int64_t DenseTensor::numel() const {
+  if (meta_.is_scalar) {
+    return 1;
+  }
+  return product(meta_.dims);
+}
+
+bool DenseTensor::IsSharedWith(const DenseTensor& b) const {
+  return storage_.get() == b.storage_.get() && storage_.get() != nullptr;
+}
+
+void* DenseTensor::mutable_data(size_t request_bytes) {
+  PADDLE_ENFORCE(
+      valid(),
+      paddle::platform::errors::PreconditionNotMet(
+          "The meta data must be valid when call the mutable data function."));
+  PADDLE_ENFORCE_NOT_NULL(
+      storage_,
+      paddle::platform::errors::PreconditionNotMet(
+          "The storage must be valid when call the mutable data function."));
+  size_t bytes = numel() * SizeOf(data_type());
+  if (request_bytes) {
+    PADDLE_ENFORCE_GE(request_bytes,
+                      bytes,
+                      paddle::platform::errors::InvalidArgument(
+                          "The reserved size %d should be enough to meet the "
+                          "volume required by metadata %d.",
+                          request_bytes,
+                          bytes));
+    bytes = request_bytes;
+  }
+  if (storage_->size() < bytes) {
+    storage_->Realloc(bytes);
+  }
+  return storage_->data();
+}
+
+template <typename T>
+T* DenseTensor::mutable_data() {
+  PADDLE_ENFORCE(
+      (data_type() == paddle::experimental::CppTypeToDataType<T>::Type()),
+      paddle::platform::errors::PreconditionNotMet(
+          "The type of data (%d) we are trying to retrieve does not match the "
+          "type of data currently contained in the container (%d).",
+          static_cast<int>(paddle::experimental::CppTypeToDataType<T>::Type()),
+          static_cast<int>(data_type())));
+  return static_cast<T*>(mutable_data());
+}
+
+template <typename T>
+const T* DenseTensor::data() const {
+  PADDLE_ENFORCE(
+      (data_type() == paddle::experimental::CppTypeToDataType<T>::Type()),
+      paddle::platform::errors::PreconditionNotMet(
+          "The type of data we are trying to retrieve does not match the "
+          "type of data currently contained in the container."));
+  return static_cast<const T*>(data());
+}
+
+const void* DenseTensor::data() const {
+  PADDLE_ENFORCE_NOT_NULL(
+      storage_,
+      paddle::platform::errors::PreconditionNotMet(
+          "The storage must be valid when call the mutable data function."));
+  return storage_->data();
+}
+
+void DenseTensor::check_memory_size() const {
+  size_t bytes = numel() * SizeOf(data_type());
+  PADDLE_ENFORCE_GE(memory_size(),
+                    bytes,
+                    paddle::platform::errors::InvalidArgument(
+                        "The memory size %d should be enough to meet the "
+                        "volume required by metadata %d.",
+                        memory_size(),
+                        bytes));
+}
+
+#define DATA_MEMBER_FUNC_INSTANTIATION(dtype)  \
+  template dtype* DenseTensor::mutable_data(); \
+  template const dtype* DenseTensor::data() const;
+
+DATA_MEMBER_FUNC_INSTANTIATION(bool);
+DATA_MEMBER_FUNC_INSTANTIATION(int8_t);
+DATA_MEMBER_FUNC_INSTANTIATION(uint8_t);
+DATA_MEMBER_FUNC_INSTANTIATION(int16_t);
+DATA_MEMBER_FUNC_INSTANTIATION(uint16_t);
+DATA_MEMBER_FUNC_INSTANTIATION(int32_t);
+DATA_MEMBER_FUNC_INSTANTIATION(uint32_t);
+DATA_MEMBER_FUNC_INSTANTIATION(int64_t);
+DATA_MEMBER_FUNC_INSTANTIATION(uint64_t);
+DATA_MEMBER_FUNC_INSTANTIATION(::paddle::platform::bfloat16);
+DATA_MEMBER_FUNC_INSTANTIATION(::paddle::platform::float16);
+DATA_MEMBER_FUNC_INSTANTIATION(float);
+DATA_MEMBER_FUNC_INSTANTIATION(double);
+DATA_MEMBER_FUNC_INSTANTIATION(::paddle::experimental::complex64);
+DATA_MEMBER_FUNC_INSTANTIATION(::paddle::experimental::complex128);
+
+#undef DATA_MEMBER_FUNC_INSTANTIATION
+
+}  // namespace pten
diff --git a/paddle/pten/core/dense_tensor.h b/paddle/pten/core/dense_tensor.h
new file mode 100644
index 0000000000000..e8e57b333ae99
--- /dev/null
+++ b/paddle/pten/core/dense_tensor.h
@@ -0,0 +1,177 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/pten/core/allocator.h"
+#include "paddle/pten/core/storage.h"
+#include "paddle/pten/core/tensor_base.h"
+#include "paddle/pten/core/tensor_meta.h"
+
+namespace pten {
+
+class CompatibleDenseTensorUtils;
+
+/// \brief The Dense tensor store values in a contiguous sequential block
+/// of memory where all values are represented. Tensors or multi-dimensional
+/// arrays are used in math operators.
+/// During the entire life cycle of a DenseTensor, its device type and key
+/// metadata are set unchanged.
+class DenseTensor : public TensorBase,
+                    public TypeInfoTraits<TensorBase, DenseTensor> {
+ public:
+  /// \brief Construct a dense tensor and allocate space.
+  /// \param a The allocator used to allocate space.
+  /// \param meta The meta data of dense tensor.
+  DenseTensor(const std::shared_ptr<Allocator>& a, const DenseTensorMeta& meta);
+
+  /// \brief Construct a dense tensor and allocate space.
+  /// \param a The allocator used to allocate space.
+  /// \param meta The meta data of dense tensor.
+  DenseTensor(const std::shared_ptr<Allocator>& a, DenseTensorMeta&& meta);
+
+  /// \brief Use existing storage space to create dense tensor. This interface
+  /// can be used to deliberately create an uninitialized dense tensor.
+  /// \param storage The existing storage.
+  /// \param meta The meta data of dense tensor.
+  DenseTensor(intrusive_ptr<Storage> storage, const DenseTensorMeta& meta);
+
+  /// \brief Use existing storage space to create dense tensor. This interface
+  /// can be used to deliberately create an uninitialized dense tensor.
+  /// \param storage The existing storage.
+  /// \param meta The meta data of dense tensor.
+  DenseTensor(intrusive_ptr<Storage> storage, DenseTensorMeta&& meta);
+
+  /// \brief Because dense tensor is a kind of container, we give a default
+  /// constructor to use for stl container. But the dense tensor created with
+  /// the default constructor is not practical.
+  DenseTensor() = default;
+
+  /// \brief Because dense tensor is a resource handle, we provide a default
+  /// move constructor to support move semantics.
+  DenseTensor(DenseTensor&& other) = default;
+
+  /// \brief We do not recommend deep copy of dense tensor because of its
+  /// efficiency and complexity across devices. The operation is disabled here.
+  DenseTensor(const DenseTensor& other) = delete;
+
+  /// \brief Destroy the tensor object and release exclusive resources.
+  virtual ~DenseTensor() = default;
+
+ public:
+  /// \brief Returns the name of the class for type traits.
+  /// \return The name of the class.
+  static const char* name() { return "DenseTensor"; }
+
+  /// \brief Returns the number of elements contained in tensor.
+  /// \return The number of elements contained in tensor.
+  int64_t numel() const;
+
+  /// \brief Returns the dims of the tensor.
+  /// \return The dims of the tensor.
+  const DDim& dims() const noexcept { return meta_.dims; }
+
+  /// \brief Returns the lod of the tensor.
+  /// \return The lod of the tensor.
+  const std::vector<std::vector<size_t>>& lod() const noexcept {
+    return meta_.lod;
+  }
+
+  /// \brief Set the lod of the tensor.
+  void set_lod(const std::vector<std::vector<size_t>>& lod) { meta_.lod = lod; }
+
+  /// \brief Returns the data type of the tensor.
+  /// \return The data type of the tensor.
+  DataType data_type() const noexcept { return meta_.type; }
+
+  /// \brief Returns the data layout of the tensor.
+  /// \return The data layout of the tensor.
+  DataLayout layout() const noexcept { return meta_.layout; }
+
+  /// \brief Returns the data place of the tensor.
+  /// \return The data place of the tensor.
+  const Place& place() const { return storage_->place(); }
+
+  /// \brief Returns the meta information of the tensor.
+  /// \return The meta information of the tensor.
+  const DenseTensorMeta& meta() const noexcept { return meta_; }
+
+  /// \brief Test whether the metadata is valid.
+  /// \return Whether the metadata is valid.
+  bool valid() const noexcept { return meta_.valid(); }
+
+  /// \brief Test whether the storage is allocated.
+  /// return Whether the storage is allocated.
+  bool initialized() const { return storage_->data(); }
+
+  /// \brief Check if storage is shared with other objects.
+  /// \return Whether the storage is shared with other objects.
+  bool IsSharedWith(const DenseTensor& b) const;
+
+  /// \brief Change the dims information in the metadata, and the corresponding
+  /// memory allocation will occur when the `mutable_data` is called.
+  /// \param dims The new dims of the dense tensor.
+  void Resize(const DDim& dims) noexcept { meta_.dims = dims; }
+
+  /// \brief Returns the actual storage size occupied by tensor, may be larger
+  /// than its shape dims.
+  /// \return The actual storage size occupied by tensor.
+  size_t memory_size() const { return storage_->size(); }
+
+  /// \brief Check that the storage area is large enough to hold the data of the
+  /// metadata size, and throw an exception if the conditions are not met.
+  void check_memory_size() const;
+
+  /// \brief Release the storage area for other purposes. Because of the
+  /// destruction of encapsulation, we do not support two dense tensors directly
+  /// sharing the same intrusive pointer.
+  /// \return The rvalue of instrusize pointer releated to the released storage.
+  intrusive_ptr<Storage> release() { return std::move(storage_); }
+
+  /// \brief Get the mutable data pointer value of type T.
+  /// Memory allocation may occur when calling this interface:
+  /// 1. When the storage size is not enough to meet the current shape of the
+  /// data.
+  /// \return The mutable data pointer value of type T.
+  template <typename T>
+  T* mutable_data();
+
+  /// \brief Get the mutable data pointer value of raw type.
+  /// Memory allocation may occur when calling this interface:
+  /// 1. When the storage size is not enough to meet the current shape of the
+  /// data.
+  /// 2. When more request_bytes parameters are used to reserve the data
+  /// storage.
+  /// param request_bytes The bytes to reserve the data storage.
+  /// \return The mutable data pointer value of type T.
+  void* mutable_data(size_t request_bytes = 0);
+
+  /// \brief Get the const data pointer value of type T.
+  /// \return The const data pointer value of type T.
+  template <typename T>
+  const T* data() const;
+
+  /// \brief Get the const data pointer value of raw type.
+  /// \return The const data pointer value of raw type.
+  const void* data() const;
+
+ private:
+  friend class CompatibleDenseTensorUtils;
+
+ private:
+  DenseTensorMeta meta_;
+  intrusive_ptr<Storage> storage_;
+};
+
+}  // namespace pten
diff --git a/paddle/pten/core/kernel_context.cc b/paddle/pten/core/kernel_context.cc
new file mode 100644
index 0000000000000..443990c07247d
--- /dev/null
+++ b/paddle/pten/core/kernel_context.cc
@@ -0,0 +1,17 @@
+//   Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/pten/core/kernel_context.h"
+
+namespace pten {}  // namespace pten
diff --git a/paddle/pten/core/kernel_context.h b/paddle/pten/core/kernel_context.h
new file mode 100644
index 0000000000000..973640906e0de
--- /dev/null
+++ b/paddle/pten/core/kernel_context.h
@@ -0,0 +1,192 @@
+//   Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <iterator>
+#include <utility>
+
+#include "paddle/pten/core/compat_utils.h"
+#include "paddle/pten/core/tensor_base.h"
+#include "paddle/utils/any.h"
+#include "paddle/utils/small_vector.h"
+
+// See Note [ Why still include the fluid headers? ]
+#include "paddle/fluid/platform/device_context.h"
+#include "paddle/fluid/platform/enforce.h"
+
+namespace pten {
+
+using DeviceContext = paddle::platform::DeviceContext;
+using DataType = paddle::experimental::DataType;
+using DataLayout = paddle::experimental::DataLayout;
+
+/**
+ * Note: KernelContext doesn't manage the life if DeviceContext and Tensor
+ *
+ * Note: KernelContext does not couple the concept of framework,
+ *       its constructor can only take the members it needs as parameters,
+ *       not Scope, RuntimeContext, etc. as parameters
+ */
+class KernelContext {
+ public:
+  KernelContext() = default;
+  explicit KernelContext(DeviceContext* dev_ctx) : dev_ctx_(dev_ctx) {}
+
+  void SetDeviceContext(DeviceContext* dev_ctx) { dev_ctx_ = dev_ctx; }
+
+  template <typename CtxType>
+  const CtxType& GetDeviceContext() const {
+    return static_cast<const CtxType&>(*dev_ctx_);
+  }
+
+  void EmplaceBackInput(std::shared_ptr<TensorBase> input) {
+    int index = inputs_.size();
+    inputs_.emplace_back(std::move(input));
+    // Record the start and end index of the input
+    input_range_.emplace_back(std::pair<int, int>(index, index + 1));
+  }
+
+  void EmplaceBackInputs(
+      paddle::SmallVector<std::shared_ptr<TensorBase>> inputs) {
+    int index = inputs_.size();
+    // Record the start and end index of the input
+    input_range_.emplace_back(
+        std::pair<int, int>(index, index + inputs.size()));
+    inputs_.insert(inputs_.end(),
+                   std::make_move_iterator(inputs.begin()),
+                   std::make_move_iterator(inputs.end()));
+  }
+
+  void EmplaceBackOutput(std::shared_ptr<TensorBase> output) {
+    int index = outputs_.size();
+    outputs_.emplace_back(std::move(output));
+    // Record the start and end index of the input
+    output_range_.emplace_back(std::pair<int, int>(index, index + 1));
+  }
+
+  void EmplaceBackOutputs(
+      paddle::SmallVector<std::shared_ptr<TensorBase>> outputs) {
+    int index = outputs_.size();
+    // Record the start and end index of the input
+    output_range_.emplace_back(
+        std::pair<int, int>(index, index + outputs.size()));
+    outputs_.insert(outputs_.end(),
+                    std::make_move_iterator(outputs.begin()),
+                    std::make_move_iterator(outputs.end()));
+  }
+
+  void EmplaceBackAttr(paddle::any attr) {
+    attrs_.emplace_back(std::move(attr));
+  }
+
+  template <typename TensorType>
+  const TensorType& InputAt(size_t idx) const {
+    return static_cast<const TensorType&>(*(inputs_.at(idx)));
+  }
+
+  template <typename TensorType>
+  std::vector<TensorType> InputBetween(size_t start, size_t end) const {
+    std::vector<TensorType> v;
+    for (size_t i = start; i < end; ++i) {
+      auto t = std::dynamic_pointer_cast<TensorType>(inputs_.at(i));
+      v.emplace_back(std::move(*t.get()));
+    }
+
+    return v;
+  }
+
+  const std::pair<int, int>& InputRangeAt(size_t idx) const {
+    return input_range_.at(idx);
+  }
+
+  const std::pair<int, int>& OutputRangeAt(size_t idx) const {
+    return output_range_.at(idx);
+  }
+
+  std::pair<int, int>& MutableInputRangeAt(size_t idx) {
+    return input_range_[idx];
+  }
+
+  std::pair<int, int>& MutableOutputRangeAt(size_t idx) {
+    return output_range_[idx];
+  }
+
+  template <typename TensorType>
+  TensorType* MutableInputAt(size_t idx) {
+    return static_cast<TensorType*>(inputs_.at(idx).get());
+  }
+
+  template <typename TensorType>
+  TensorType* MutableOutputAt(size_t idx) {
+    return static_cast<TensorType*>(outputs_.at(idx).get());
+  }
+
+  template <typename TensorType>
+  std::vector<TensorType*> MutableOutputBetween(size_t start, size_t end) {
+    std::vector<TensorType*> v;
+    for (size_t i = start; i < end; ++i) {
+      v.emplace_back(static_cast<TensorType*>(outputs_.at(i).get()));
+    }
+
+    return v;
+  }
+
+  template <typename AttrType>
+  AttrType AttrAt(size_t idx) const {
+    try {
+      return paddle::any_cast<AttrType>(attrs_.at(idx));
+    } catch (paddle::bad_any_cast&) {
+      PADDLE_THROW(paddle::platform::errors::InvalidArgument(
+          "Attribute cast error in Op Kernel Context."));
+    }
+  }
+
+  // Temporary method: For compatible with fluid Tensor and improve performance
+  // Only deal with DenseTensor now
+  void ClearData() {
+    for (auto& in : inputs_) {
+      CompatibleDenseTensorUtils::ClearStorage(
+          static_cast<DenseTensor*>(in.get()));
+    }
+    for (auto& out : outputs_) {
+      CompatibleDenseTensorUtils::ClearStorage(
+          static_cast<DenseTensor*>(out.get()));
+    }
+    attrs_.clear();
+  }
+
+  size_t InputsSize() const { return inputs_.size(); }
+  size_t OutputsSize() const { return outputs_.size(); }
+  size_t AttrsSize() const { return attrs_.size(); }
+
+ private:
+  bool IsDuplicable() const { return input_range_.size() != inputs_.size(); }
+
+ private:
+  // DeviceContext base class
+  DeviceContext* dev_ctx_;
+
+  // TODO(chenweihang): Tensor -> Tensor*, Tensor should by managed `scope`
+  // Note: can't use API Tensor here, the inference don't use this API Tensor
+  paddle::SmallVector<std::shared_ptr<TensorBase>> inputs_;
+  paddle::SmallVector<std::shared_ptr<TensorBase>> outputs_;
+  paddle::SmallVector<paddle::any> attrs_;
+
+  // Only contains input like list[Tensor] need `range`
+  paddle::SmallVector<std::pair<int, int>> input_range_;
+  paddle::SmallVector<std::pair<int, int>> output_range_;
+};
+
+}  // namespace pten
diff --git a/paddle/pten/core/kernel_def.h b/paddle/pten/core/kernel_def.h
new file mode 100644
index 0000000000000..48a579cd02b51
--- /dev/null
+++ b/paddle/pten/core/kernel_def.h
@@ -0,0 +1,42 @@
+//   Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+namespace pten {
+
+class Kernel;
+class KernelKey;
+class KernelArgsDef;
+class KernelContext;
+
+using KernelFn = void (*)(KernelContext* ctx);
+using KernelArgsDefFn = void (*)(Kernel* kernel);
+using KernelArgsParseFn = void (*)(const KernelKey& default_key,
+                                   KernelArgsDef* args_def);
+
+// Multiple kernels of the same operation are distinguished by the difference
+// of the overload name. For the convenience of reuse, we define some overload
+// naming strings for the naming of the kernel
+
+// For kernels that contains dynamic tensor attribute and it need to be always
+// on host device, such as `ScaleTensor`
+constexpr char kContainHostTensorSuffix[] = "host";
+
+// For kernels with SelectedRowsTensor input and output
+constexpr char kContainSelectedRowsSuffix[] = "sr";
+
+// For kernels with intermediate output
+constexpr char kContainMidOutputTensorSuffix[] = "mid";
+}  // namespace pten
diff --git a/paddle/pten/core/kernel_factory.cc b/paddle/pten/core/kernel_factory.cc
new file mode 100644
index 0000000000000..aeefb7cfefb78
--- /dev/null
+++ b/paddle/pten/core/kernel_factory.cc
@@ -0,0 +1,111 @@
+//   Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/pten/core/kernel_factory.h"
+
+// See Note [ Why still include the fluid headers? ]
+#include "paddle/fluid/platform/enforce.h"
+
+namespace pten {
+
+uint32_t KernelKey::Hash::operator()(const KernelKey& key) const {
+  uint32_t hash_value = 0;
+  // |----31-20------|---19-12---|---11-8----|---7-0---|
+  // | For extension | DataType | DataLayout | Backend |
+  hash_value |= static_cast<uint8_t>(key.backend());
+  hash_value |=
+      (static_cast<uint8_t>(key.layout()) << KernelKey::kBackendBitLength);
+  hash_value |=
+      (static_cast<uint16_t>(key.dtype())
+       << (KernelKey::kBackendBitLength + KernelKey::kDataLayoutBitLength));
+  return hash_value;
+}
+
+KernelFactory& KernelFactory::Instance() {
+  static KernelFactory g_op_kernel_factory;
+  return g_op_kernel_factory;
+}
+
+Kernel KernelFactory::SelectKernel(const KernelName& kernel_name,
+                                   const KernelKey& kernel_key) const {
+  auto iter = kernels_.find(kernel_name);
+  if (iter == kernels_.end()) {
+    return Kernel();
+  }
+  auto kernel_iter = iter->second.find(kernel_key);
+  if (kernel_iter == iter->second.end()) {
+    return Kernel();
+  }
+  return kernel_iter->second;
+}
+
+const Kernel& KernelFactory::SelectKernelOrThrowError(
+    const KernelName& kernel_name, const KernelKey& kernel_key) const {
+  auto iter = kernels_.find(kernel_name);
+  PADDLE_ENFORCE_NE(iter,
+                    kernels_.end(),
+                    paddle::platform::errors::NotFound(
+                        "The kernel `%s` is not registered.", kernel_name));
+
+  auto kernel_iter = iter->second.find(kernel_key);
+  // TODO(chenweihang): polish refind impl here
+  if (kernel_iter == iter->second.end() &&
+      kernel_key.layout() != pten::DataLayout::ANY) {
+    pten::KernelKey any_layout_kernel_key(
+        kernel_key.backend(), pten::DataLayout::ANY, kernel_key.dtype());
+    kernel_iter = iter->second.find(any_layout_kernel_key);
+  }
+  PADDLE_ENFORCE_NE(
+      kernel_iter,
+      iter->second.end(),
+      paddle::platform::errors::NotFound(
+          "The kernel with key %s of kernel `%s` is not registered.",
+          kernel_key,
+          kernel_name));
+
+  return kernel_iter->second;
+}
+
+const Kernel& KernelFactory::SelectKernelOrThrowError(
+    const KernelName& kernel_name,
+    Backend backend,
+    DataLayout layout,
+    DataType dtype) const {
+  return SelectKernelOrThrowError(kernel_name,
+                                  KernelKey(backend, layout, dtype));
+}
+
+std::ostream& operator<<(std::ostream& os, const Kernel& kernel) {
+  os << "InputNum(" << kernel.args_def().input_defs().size() << "): [";
+  for (auto& in_def : kernel.args_def().input_defs()) {
+    os << "<" << in_def.backend << ", " << in_def.layout << ", " << in_def.dtype
+       << ">";
+  }
+  os << "]), AttributeNum(" << kernel.args_def().attribute_defs().size()
+     << "), OutputNum(" << kernel.args_def().output_defs().size() << ")";
+  return os;
+}
+
+std::ostream& operator<<(std::ostream& os, KernelFactory& kernel_factory) {
+  for (const auto& op_kernel_pair : kernel_factory.kernels()) {
+    os << "- kernel name: " << op_kernel_pair.first << "\n";
+    for (const auto& kernel_pair : op_kernel_pair.second) {
+      os << "\t- kernel key: " << kernel_pair.first << " | "
+         << "kernel: " << kernel_pair.second << "\n";
+    }
+  }
+  return os;
+}
+
+}  // namespace pten
diff --git a/paddle/pten/core/kernel_factory.h b/paddle/pten/core/kernel_factory.h
new file mode 100644
index 0000000000000..4ec80521b44a6
--- /dev/null
+++ b/paddle/pten/core/kernel_factory.h
@@ -0,0 +1,317 @@
+//   Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <ostream>
+#include <string>
+#include <unordered_map>
+#include <unordered_set>
+#include <utility>
+
+#include "paddle/pten/common/backend.h"
+#include "paddle/pten/common/data_type.h"
+#include "paddle/pten/common/layout.h"
+#include "paddle/pten/core/kernel_def.h"
+
+// See Note [ Why still include the fluid headers? ]
+#include "paddle/fluid/platform/enforce.h"
+#include "paddle/utils/flat_hash_map.h"
+#include "paddle/utils/small_vector.h"
+
+namespace pten {
+
+using DataType = paddle::experimental::DataType;
+using DataLayout = paddle::experimental::DataLayout;
+
+/**
+ * [ Naming considerations ]
+ *
+ * The tensor operation library contains many kernels, and the computation
+ * in each specific scenario is represented by an kernel.
+ *
+ * We directly named it `Kernel` instead of `Kernel`, the tensor operation
+ * library here and fluid are independent, avoiding developers from
+ * misunderstanding the relationship between the two concepts.
+ */
+
+class KernelContext;
+
+using KernelFn = void (*)(KernelContext* ctx);
+
+class KernelName final {
+ public:
+  KernelName(std::string name, std::string overload_name)
+      : name_(std::move(name)), overload_name_(std::move(overload_name)) {}
+
+  KernelName(const std::string& kernel_name) {
+    ParseNameAndOverloadNameFromString(kernel_name);
+  }
+
+  KernelName(const char* kernel_name) {
+    std::string kernel_name_str(kernel_name);
+    ParseNameAndOverloadNameFromString(kernel_name_str);
+  }
+
+  const std::string& name() const { return name_; }
+  const std::string& overload_name() const { return overload_name_; }
+
+  struct Hash {
+    size_t operator()(const KernelName& kernel_name) const {
+      return std::hash<std::string>()(kernel_name.name()) ^
+             (std::hash<std::string>()(kernel_name.overload_name()) << 1);
+    }
+  };
+
+  size_t hash_value() const { return Hash()(*this); }
+
+  bool operator<(const KernelName& kernel_name) const {
+    return hash_value() < kernel_name.hash_value();
+  }
+
+  bool operator==(const KernelName& kernel_name) const {
+    return hash_value() == kernel_name.hash_value();
+  }
+
+  bool operator!=(const KernelName& kernel_name) const {
+    return hash_value() != kernel_name.hash_value();
+  }
+
+ private:
+  void ParseNameAndOverloadNameFromString(const std::string& kernel_name) {
+    size_t pos = kernel_name.find_first_of('.');
+    if (pos == std::string::npos) {
+      name_ = kernel_name;
+      overload_name_ = "";
+    } else {
+      name_ = kernel_name.substr(0, pos);
+      overload_name_ = kernel_name.substr(pos + 1, kernel_name.size());
+    }
+  }
+
+  // TODO(chenweihang): use string_view to improve performance later
+  std::string name_;
+  std::string overload_name_;
+};
+
+class KernelKey {
+ public:
+  KernelKey() = default;
+
+  KernelKey(Backend backend, DataLayout layout, DataType dtype)
+      : backend_(backend), layout_(layout), dtype_(dtype) {}
+
+  Backend backend() const { return backend_; }
+  DataLayout layout() const { return layout_; }
+  DataType dtype() const { return dtype_; }
+
+  struct Hash {
+    // Note: Now the number of bits we need does not exceed 32 bits, so there is
+    // no need to use 64 bits. If needed in the future, it can be expanded,
+    // but now we don’t over-design.
+    uint32_t operator()(const KernelKey& key) const;
+  };
+
+  uint32_t hash_value() const { return Hash()(*this); }
+
+  bool operator<(const KernelKey& key) const {
+    return hash_value() < key.hash_value();
+  }
+
+  bool operator==(const KernelKey& key) const {
+    return hash_value() == key.hash_value();
+  }
+
+  bool operator!=(const KernelKey& key) const {
+    return hash_value() != key.hash_value();
+  }
+
+ private:
+  // In total should be smaller than 32.
+  constexpr static int kBackendBitLength = 8;
+  constexpr static int kDataLayoutBitLength = 4;
+  constexpr static int kDataTypeBitLength = 8;
+
+  Backend backend_{Backend::UNDEFINED};
+  DataLayout layout_{DataLayout::UNDEFINED};
+  DataType dtype_{DataType::UNDEFINED};
+};
+
+// TODO(chenweihang): how deal with vector<Param>?
+struct TensorArgDef {
+  Backend backend;
+  DataLayout layout;
+  DataType dtype;
+
+  TensorArgDef(Backend in_backend, DataLayout in_layout, DataType in_dtype)
+      : backend(in_backend), layout(in_layout), dtype(in_dtype) {}
+
+  TensorArgDef& SetBackend(Backend in_backend) {
+    backend = in_backend;
+    return *this;
+  }
+
+  TensorArgDef& SetDataLayout(DataLayout in_layout) {
+    layout = in_layout;
+    return *this;
+  }
+
+  TensorArgDef& SetDataType(DataType in_dtype) {
+    dtype = in_dtype;
+    return *this;
+  }
+};
+
+struct AttributeArgDef {
+  std::type_index type_index;
+
+  explicit AttributeArgDef(std::type_index type_index)
+      : type_index(type_index) {}
+};
+
+class KernelArgsDef {
+ public:
+  KernelArgsDef() = default;
+
+  void AppendInput(Backend backend, DataLayout layout, DataType dtype) {
+    input_defs_.emplace_back(TensorArgDef(backend, layout, dtype));
+  }
+
+  void AppendOutput(Backend backend, DataLayout layout, DataType dtype) {
+    output_defs_.emplace_back(TensorArgDef(backend, layout, dtype));
+  }
+
+  void AppendAttribute(std::type_index type_index) {
+    attribute_defs_.emplace_back(AttributeArgDef(type_index));
+  }
+
+  const paddle::SmallVector<TensorArgDef>& input_defs() const {
+    return input_defs_;
+  }
+
+  const paddle::SmallVector<TensorArgDef>& output_defs() const {
+    return output_defs_;
+  }
+
+  const paddle::SmallVector<AttributeArgDef>& attribute_defs() const {
+    return attribute_defs_;
+  }
+
+  paddle::SmallVector<TensorArgDef>& input_defs() { return input_defs_; }
+
+  paddle::SmallVector<TensorArgDef>& output_defs() { return output_defs_; }
+
+  paddle::SmallVector<AttributeArgDef>& attribute_defs() {
+    return attribute_defs_;
+  }
+
+ private:
+  paddle::SmallVector<TensorArgDef> input_defs_{{}};
+  paddle::SmallVector<TensorArgDef> output_defs_{{}};
+  paddle::SmallVector<AttributeArgDef> attribute_defs_{{}};
+};
+
+class Kernel {
+ public:
+  // for map element contruct
+  Kernel() = default;
+
+  explicit Kernel(KernelFn fn) : fn_(fn) {}
+
+  void operator()(KernelContext* ctx) const { fn_(ctx); }
+
+  KernelArgsDef* mutable_args_def() { return &args_def_; }
+
+  const KernelArgsDef& args_def() const { return args_def_; }
+
+  TensorArgDef& InputAt(size_t idx) { return args_def_.input_defs().at(idx); }
+
+  TensorArgDef& OutputAt(size_t idx) { return args_def_.output_defs().at(idx); }
+
+  bool IsValid() { return fn_ != nullptr; }
+
+ private:
+  KernelFn fn_{nullptr};
+  KernelArgsDef args_def_;
+};
+
+/**
+ * Note: Each Computation need a basic kernel map that named by kernel_name.
+ *       Such as for scale op, KernelMap contains a `scale` kernel map,
+ *       if it still need other overload kernel, the op name can be
+ *       `scale.***`.
+ */
+class KernelFactory {
+ public:
+  // replaced by paddle::flat_hash_map later
+  using KernelMap = paddle::flat_hash_map<
+      KernelName,
+      paddle::flat_hash_map<KernelKey, Kernel, KernelKey::Hash>,
+      KernelName::Hash>;
+
+  static KernelFactory& Instance();
+
+  KernelMap& kernels() { return kernels_; }
+
+  void InsertCompatibleOpType(const std::string& op_type) {
+    compatible_op_types_.insert(op_type);
+  }
+
+  bool HasCompatiblePtenKernel(const std::string& op_type) const {
+    return compatible_op_types_.count(op_type) > 0;
+  }
+
+  const Kernel& SelectKernelOrThrowError(const KernelName& kernel_name,
+                                         const KernelKey& kernel_key) const;
+
+  const Kernel& SelectKernelOrThrowError(const KernelName& kernel_name,
+                                         Backend backend,
+                                         DataLayout layout,
+                                         DataType dtype) const;
+
+  Kernel SelectKernel(const KernelName& kernel_name,
+                      const KernelKey& kernel_key) const;
+
+ private:
+  KernelFactory() = default;
+
+  KernelMap kernels_;
+  // Used to be compatible with the original execution system and
+  // quickly confirm whether the new kernel can be called
+  std::unordered_set<std::string> compatible_op_types_;
+};
+
+/** operator << overload **/
+
+inline std::ostream& operator<<(std::ostream& os,
+                                const KernelName& kernel_name) {
+  if (kernel_name.overload_name().empty()) {
+    os << kernel_name.name();
+  } else {
+    os << kernel_name.name() << "." << kernel_name.overload_name();
+  }
+  return os;
+}
+
+inline std::ostream& operator<<(std::ostream& os, const KernelKey& kernel_key) {
+  os << "(" << kernel_key.backend() << ", " << kernel_key.layout() << ", "
+     << kernel_key.dtype() << ")";
+  return os;
+}
+
+std::ostream& operator<<(std::ostream& os, const Kernel& kernel);
+
+std::ostream& operator<<(std::ostream& os, KernelFactory& kernel_factory);
+
+}  // namespace pten
diff --git a/paddle/pten/core/kernel_registry.h b/paddle/pten/core/kernel_registry.h
new file mode 100644
index 0000000000000..c2b97148aa5fb
--- /dev/null
+++ b/paddle/pten/core/kernel_registry.h
@@ -0,0 +1,858 @@
+//   Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <cstring>
+#include <type_traits>
+#include <typeindex>
+#include <typeinfo>
+#include <vector>
+
+#include "paddle/pten/core/kernel_def.h"
+#include "paddle/pten/core/kernel_factory.h"
+#include "paddle/pten/core/kernel_utils.h"
+
+namespace pten {
+
+#define BACKEND(arg__) pten::Backend::arg__
+#define DATALAYOUT(arg__) pten::DataLayout::arg__
+#define DATATYPE(arg__) pten::DataType::arg__
+
+template <typename Func>
+struct KernelArgsParseFunctor;
+
+template <typename Return_, typename... Args_>
+struct KernelArgsParseFunctor<Return_ (*)(Args_...)> {
+  using Args = std::tuple<Args_...>;
+  enum : std::size_t { Arity = sizeof...(Args_) };
+  using Indices = std::make_index_sequence<Arity>;
+  template <std::size_t Index>
+  using Arg = typename std::tuple_element<Index, Args>::type;
+
+  static void Parse(const KernelKey& default_key, KernelArgsDef* args_def) {
+    // TODO(chenweihang): The fluid Tensor's default layout is NCHW,
+    // it is not same as kernel's layout, we should fix this error on
+    // fluid Tensor
+    auto default_tensor_layout = pten::DataLayout::NCHW;
+    if (default_key.layout() != pten::DataLayout::ANY) {
+      default_tensor_layout = default_key.layout();
+    }
+    auto args_type = ParseArgType(Indices{});
+    for (auto arg_type : args_type) {
+      if (arg_type == std::type_index(typeid(const CPUContext&))
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+          ||
+          arg_type == std::type_index(typeid(const CUDAContext&))) {
+#else
+              ) {
+#endif
+        // do nothing, skip context arg now
+      } else if (arg_type == std::type_index(typeid(const DenseTensor&))) {
+        args_def->AppendInput(
+            default_key.backend(), default_tensor_layout, default_key.dtype());
+      } else if (arg_type ==
+                 std::type_index(typeid(const std::vector<DenseTensor>&))) {
+        args_def->AppendInput(
+            default_key.backend(), default_tensor_layout, default_key.dtype());
+      } else if (arg_type == std::type_index(typeid(DenseTensor*))) {
+        args_def->AppendOutput(
+            default_key.backend(), default_tensor_layout, default_key.dtype());
+      } else if (arg_type ==
+                 std::type_index(typeid(std::vector<DenseTensor*>))) {
+        args_def->AppendOutput(
+            default_key.backend(), default_tensor_layout, default_key.dtype());
+      } else {
+        // Attribute deal with
+        // TODO(chenweihang): now here allow any types of attribute, maybe
+        // should add limits here
+        args_def->AppendAttribute(arg_type);
+      }
+    }
+  }
+
+ private:
+  template <std::size_t... INDEX>
+  static std::vector<std::type_index> ParseArgType(
+      std::index_sequence<INDEX...>) {
+    return {std::type_index(typeid(Arg<INDEX>))...};
+  }
+};
+
+struct KernelRegistrar {
+ public:
+  KernelRegistrar(const char* kernel_name_cstr,
+                  Backend backend,
+                  DataLayout layout,
+                  DataType dtype,
+                  KernelArgsParseFn args_parse_fn,
+                  KernelArgsDefFn args_def_fn,
+                  KernelFn kernel_fn) {
+    ConstructKernel(kernel_name_cstr,
+                    backend,
+                    layout,
+                    dtype,
+                    args_parse_fn,
+                    args_def_fn,
+                    kernel_fn);
+  }
+
+  KernelRegistrar(const char* kernel_name_cstr,
+                  Backend backend,
+                  DataLayout layout,
+                  KernelArgsParseFn args_parse_fn,
+                  KernelArgsDefFn args_def_fn,
+                  KernelFn kernel_fn) {
+    if (layout == DataLayout::ANY) {
+      for (size_t layout_iter = static_cast<size_t>(DataLayout::NHWC);
+           layout_iter != static_cast<size_t>(DataLayout::NUM_DATA_LAYOUTS);
+           layout_iter++) {
+        for (size_t dtype = static_cast<size_t>(DataType::BOOL);
+             dtype != static_cast<size_t>(DataType::NUM_DATA_TYPES);
+             dtype++) {
+          ConstructKernel(kernel_name_cstr,
+                          backend,
+                          static_cast<DataLayout>(layout_iter),
+                          static_cast<DataType>(dtype),
+                          args_parse_fn,
+                          args_def_fn,
+                          kernel_fn);
+        }
+      }
+    } else {
+      for (size_t dtype = static_cast<size_t>(DataType::BOOL);
+           dtype != static_cast<size_t>(DataType::NUM_DATA_TYPES);
+           dtype++) {
+        ConstructKernel(kernel_name_cstr,
+                        backend,
+                        layout,
+                        static_cast<DataType>(dtype),
+                        args_parse_fn,
+                        args_def_fn,
+                        kernel_fn);
+      }
+    }
+  }
+
+ private:
+  void ConstructKernel(const char* kernel_name_cstr,
+                       Backend backend,
+                       DataLayout layout,
+                       DataType dtype,
+                       KernelArgsParseFn args_parse_fn,
+                       KernelArgsDefFn args_def_fn,
+                       KernelFn kernel_fn) {
+    KernelName kernel_name(kernel_name_cstr);
+    KernelKey kernel_key(backend, layout, dtype);
+    Kernel kernel(kernel_fn);
+    args_parse_fn(kernel_key, kernel.mutable_args_def());
+    args_def_fn(&kernel);
+
+    KernelFactory::Instance().InsertCompatibleOpType(kernel_name.name());
+    KernelFactory::Instance().kernels()[kernel_name][kernel_key] = kernel;
+  }
+};
+
+#define PT_STATIC_ASSERT_GLOBAL_NAMESPACE(uniq_name, msg) \
+  _PT_STATIC_ASSERT_GLOBAL_NAMESPACE(uniq_name, msg)
+
+#define _PT_STATIC_ASSERT_GLOBAL_NAMESPACE(uniq_name, msg)                    \
+  struct __test_global_namespace_##uniq_name##__ {};                          \
+  static_assert(std::is_same<::__test_global_namespace_##uniq_name##__,       \
+                             __test_global_namespace_##uniq_name##__>::value, \
+                msg)
+
+#ifdef __COUNTER__
+#define PT_ID __COUNTER__
+#else
+#define PT_ID __LINE__
+#endif
+
+#if defined(_WIN32)
+#define UNUSED
+#define __builtin_expect(EXP, C) (EXP)
+#else
+#define UNUSED __attribute__((unused))
+#endif
+
+#define PT_CONCATENATE(arg1, arg2) PT_CONCATENATE1(arg1, arg2)
+#define PT_CONCATENATE1(arg1, arg2) PT_CONCATENATE2(arg1, arg2)
+#define PT_CONCATENATE2(arg1, arg2) arg1##arg2
+#define PT_EXPAND(x) x
+
+/**
+ * Reference:
+ *
+ *   https://stackoverflow.com/questions/1872220/is-it-possible-to-iterate-over-arguments-in-variadic-macros
+ *   https://stackoverflow.com/questions/9183993/msvc-variadic-macro-expansion?rq=1
+ *   https://stackoverflow.com/questions/5134523/msvc-doesnt-expand-va-args-correctly
+ *
+ * Very carefully tiptoeing around an MSVC bug where it improperly expands
+ * __VA_ARGS__ as a single token in argument lists.  See these URLs for details:
+ *
+ *   http://connect.microsoft.com/VisualStudio/feedback/details/380090/variadic-macro-replacement
+ *   http://cplusplus.co.il/2010/07/17/variadic-macro-to-count-number-of-arguments/#comment-644
+ */
+#define PT_NARGS(...) _PT_NARGS((__VA_ARGS__, _PT_RESQ_N()))
+#define _PT_NARGS(...) _PT_ARG_N(__VA_ARGS__)
+#define _PT_ARG_N_EXPAND(                                                     \
+    _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, N, ...) \
+  N
+#define _PT_ARG_N(args) _PT_ARG_N_EXPAND args
+#define _PT_RESQ_N() 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+
+#define PT_REGISTER_KERNEL(                                       \
+    kernel_name, backend, layout, meta_kernel_fn, cpp_dtype, ...) \
+  _PT_REGISTER_KERNEL(kernel_name,                                \
+                      PT_ID,                                      \
+                      backend,                                    \
+                      layout,                                     \
+                      meta_kernel_fn,                             \
+                      cpp_dtype,                                  \
+                      __VA_ARGS__)
+#ifndef _WIN32
+#define _PT_REGISTER_KERNEL(                                                   \
+    kernel_name, func_id, backend, layout, meta_kernel_fn, cpp_dtype, ...)     \
+  PT_STATIC_ASSERT_GLOBAL_NAMESPACE(                                           \
+      PT_CONCATENATE(pt_op_kernel_ns_check_, func_id),                         \
+      "PT_REGISTER_KERNEL must be called in global namespace.");               \
+  PT_KERNEL_INSTANTIATION(meta_kernel_fn, cpp_dtype, __VA_ARGS__);             \
+  static void PT_CONCATENATE(__PT_KERNEL_args_def_FN_,                         \
+                             func_id)(::pten::Kernel*);                        \
+  PT_KERNEL_REGISTRAR_INIT(kernel_name,                                        \
+                           func_id,                                            \
+                           backend,                                            \
+                           layout,                                             \
+                           &PT_CONCATENATE(__PT_KERNEL_args_def_FN_, func_id), \
+                           meta_kernel_fn,                                     \
+                           cpp_dtype,                                          \
+                           __VA_ARGS__);                                       \
+  void PT_CONCATENATE(__PT_KERNEL_args_def_FN_,                                \
+                      func_id)(::pten::Kernel * kernel)
+#else
+#define _PT_REGISTER_KERNEL(                                                   \
+    kernel_name, func_id, backend, layout, meta_kernel_fn, cpp_dtype, ...)     \
+  PT_STATIC_ASSERT_GLOBAL_NAMESPACE(                                           \
+      PT_CONCATENATE(pt_op_kernel_ns_check_, func_id),                         \
+      "PT_REGISTER_KERNEL must be called in global namespace.");               \
+  static void PT_CONCATENATE(__PT_KERNEL_args_def_FN_,                         \
+                             func_id)(::pten::Kernel*);                        \
+  PT_KERNEL_REGISTRAR_INIT(kernel_name,                                        \
+                           func_id,                                            \
+                           backend,                                            \
+                           layout,                                             \
+                           &PT_CONCATENATE(__PT_KERNEL_args_def_FN_, func_id), \
+                           meta_kernel_fn,                                     \
+                           cpp_dtype,                                          \
+                           __VA_ARGS__);                                       \
+  void PT_CONCATENATE(__PT_KERNEL_args_def_FN_,                                \
+                      func_id)(::pten::Kernel * kernel)
+#endif
+
+#define PT_KERNEL_INSTANTIATION(meta_kernel_fn, cpp_dtype, ...) \
+  _PT_KERNEL_INSTANTIATION(PT_NARGS(cpp_dtype, __VA_ARGS__),    \
+                           meta_kernel_fn,                      \
+                           cpp_dtype,                           \
+                           __VA_ARGS__)
+
+#define _PT_KERNEL_INSTANTIATION(N, meta_kernel_fn, cpp_dtype, ...) \
+  PT_CONCATENATE(_PT_KERNEL_INSTANTIATION_, N)                      \
+  (meta_kernel_fn, cpp_dtype, __VA_ARGS__)
+
+/**
+ * `template decltype(fn) fn` can work on gcc and clang,
+ * but msvc will failed, error like:
+ *
+ *   error C2206: typedef cannot be used for function definition
+ *
+ * reference:
+ *
+ *   https://stackoverflow.com/questions/63989585/explicit-instantiation-of-function-using-decltype-work-on-g-but-not-on-visua
+ *
+ * So we solve the explict instantiation of kernel by CMake
+ */
+
+#define _PT_KERNEL_INSTANTIATION_1(meta_kernel_fn, cpp_dtype, ...) \
+  template decltype(meta_kernel_fn<cpp_dtype>) meta_kernel_fn<cpp_dtype>
+#define _PT_KERNEL_INSTANTIATION_2(meta_kernel_fn, cpp_dtype, ...)        \
+  template decltype(meta_kernel_fn<cpp_dtype>) meta_kernel_fn<cpp_dtype>; \
+  PT_EXPAND(_PT_KERNEL_INSTANTIATION_1(meta_kernel_fn, __VA_ARGS__))
+#define _PT_KERNEL_INSTANTIATION_3(meta_kernel_fn, cpp_dtype, ...)        \
+  template decltype(meta_kernel_fn<cpp_dtype>) meta_kernel_fn<cpp_dtype>; \
+  PT_EXPAND(_PT_KERNEL_INSTANTIATION_2(meta_kernel_fn, __VA_ARGS__))
+#define _PT_KERNEL_INSTANTIATION_4(meta_kernel_fn, cpp_dtype, ...)        \
+  template decltype(meta_kernel_fn<cpp_dtype>) meta_kernel_fn<cpp_dtype>; \
+  PT_EXPAND(_PT_KERNEL_INSTANTIATION_3(meta_kernel_fn, __VA_ARGS__))
+#define _PT_KERNEL_INSTANTIATION_5(meta_kernel_fn, cpp_dtype, ...)        \
+  template decltype(meta_kernel_fn<cpp_dtype>) meta_kernel_fn<cpp_dtype>; \
+  PT_EXPAND(_PT_KERNEL_INSTANTIATION_4(meta_kernel_fn, __VA_ARGS__))
+#define _PT_KERNEL_INSTANTIATION_6(meta_kernel_fn, cpp_dtype, ...)        \
+  template decltype(meta_kernel_fn<cpp_dtype>) meta_kernel_fn<cpp_dtype>; \
+  PT_EXPAND(_PT_KERNEL_INSTANTIATION_5(meta_kernel_fn, __VA_ARGS__))
+#define _PT_KERNEL_INSTANTIATION_7(meta_kernel_fn, cpp_dtype, ...)        \
+  template decltype(meta_kernel_fn<cpp_dtype>) meta_kernel_fn<cpp_dtype>; \
+  PT_EXPAND(_PT_KERNEL_INSTANTIATION_6(meta_kernel_fn, __VA_ARGS__))
+#define _PT_KERNEL_INSTANTIATION_8(meta_kernel_fn, cpp_dtype, ...)        \
+  template decltype(meta_kernel_fn<cpp_dtype>) meta_kernel_fn<cpp_dtype>; \
+  PT_EXPAND(_PT_KERNEL_INSTANTIATION_7(meta_kernel_fn, __VA_ARGS__))
+#define _PT_KERNEL_INSTANTIATION_9(meta_kernel_fn, cpp_dtype, ...)        \
+  template decltype(meta_kernel_fn<cpp_dtype>) meta_kernel_fn<cpp_dtype>; \
+  PT_EXPAND(_PT_KERNEL_INSTANTIATION_8(meta_kernel_fn, __VA_ARGS__))
+#define _PT_KERNEL_INSTANTIATION_10(meta_kernel_fn, cpp_dtype, ...)       \
+  template decltype(meta_kernel_fn<cpp_dtype>) meta_kernel_fn<cpp_dtype>; \
+  PT_EXPAND(_PT_KERNEL_INSTANTIATION_9(meta_kernel_fn, __VA_ARGS__))
+#define _PT_KERNEL_INSTANTIATION_11(meta_kernel_fn, cpp_dtype, ...)       \
+  template decltype(meta_kernel_fn<cpp_dtype>) meta_kernel_fn<cpp_dtype>; \
+  PT_EXPAND(_PT_KERNEL_INSTANTIATION_10(meta_kernel_fn, __VA_ARGS__))
+#define _PT_KERNEL_INSTANTIATION_12(meta_kernel_fn, cpp_dtype, ...)       \
+  template decltype(meta_kernel_fn<cpp_dtype>) meta_kernel_fn<cpp_dtype>; \
+  PT_EXPAND(_PT_KERNEL_INSTANTIATION_11(meta_kernel_fn, __VA_ARGS__))
+#define _PT_KERNEL_INSTANTIATION_13(meta_kernel_fn, cpp_dtype, ...)       \
+  template decltype(meta_kernel_fn<cpp_dtype>) meta_kernel_fn<cpp_dtype>; \
+  PT_EXPAND(_PT_KERNEL_INSTANTIATION_12(meta_kernel_fn, __VA_ARGS__))
+#define _PT_KERNEL_INSTANTIATION_14(meta_kernel_fn, cpp_dtype, ...)       \
+  template decltype(meta_kernel_fn<cpp_dtype>) meta_kernel_fn<cpp_dtype>; \
+  PT_EXPAND(_PT_KERNEL_INSTANTIATION_13(meta_kernel_fn, __VA_ARGS__))
+#define _PT_KERNEL_INSTANTIATION_15(meta_kernel_fn, cpp_dtype, ...)       \
+  template decltype(meta_kernel_fn<cpp_dtype>) meta_kernel_fn<cpp_dtype>; \
+  PT_EXPAND(_PT_KERNEL_INSTANTIATION_14(meta_kernel_fn, __VA_ARGS__))
+
+#define PT_KERNEL_REGISTRAR_INIT(kernel_name,                 \
+                                 func_id,                     \
+                                 backend,                     \
+                                 layout,                      \
+                                 args_def_fn,                 \
+                                 meta_kernel_fn,              \
+                                 cpp_dtype,                   \
+                                 ...)                         \
+  _PT_KERNEL_REGISTRAR_INIT(PT_NARGS(cpp_dtype, __VA_ARGS__), \
+                            kernel_name,                      \
+                            func_id,                          \
+                            backend,                          \
+                            layout,                           \
+                            args_def_fn,                      \
+                            meta_kernel_fn,                   \
+                            cpp_dtype,                        \
+                            __VA_ARGS__)
+
+// clang-format off
+
+/* The =pre-commit always treats this macro into the wrong format,
+  and multi-line macros cannot be skipped with NOLINT.*/
+#define _PT_KERNEL_REGISTRAR_INIT(N,              \
+                                  kernel_name,    \
+                                  func_id,        \
+                                  backend,        \
+                                  layout,         \
+                                  args_def_fn,    \
+                                  meta_kernel_fn, \
+                                  cpp_dtype,      \
+                                  ...)            \
+  PT_CONCATENATE(_PT_KERNEL_REGISTRAR_INIT_, N) ( \
+    kernel_name,                                  \
+    func_id,                                      \
+    PT_ID,                                        \
+    backend,                                      \
+    layout,                                       \
+    args_def_fn,                                  \
+    meta_kernel_fn,                               \
+    cpp_dtype,                                    \
+    __VA_ARGS__)
+
+// clang-format on
+
+#define _PT_KERNEL_REGISTRAR_INIT_1(kernel_name,                    \
+                                    func_id,                        \
+                                    registrar_id,                   \
+                                    backend,                        \
+                                    layout,                         \
+                                    args_def_fn,                    \
+                                    meta_kernel_fn,                 \
+                                    cpp_dtype,                      \
+                                    ...)                            \
+  static const ::pten::KernelRegistrar PT_CONCATENATE(              \
+      __reg_pt_op_kernel_##func_id##_, registrar_id)(               \
+      kernel_name,                                                  \
+      BACKEND(backend),                                             \
+      DATALAYOUT(layout),                                           \
+      ::paddle::experimental::CppTypeToDataType<cpp_dtype>::Type(), \
+      ::pten::KernelArgsParseFunctor<decltype(                      \
+          &meta_kernel_fn<cpp_dtype>)>::Parse,                      \
+      args_def_fn,                                                  \
+      PT_KERNEL(meta_kernel_fn<cpp_dtype>));
+#define _PT_KERNEL_REGISTRAR_INIT_2(kernel_name,                    \
+                                    func_id,                        \
+                                    registrar_id,                   \
+                                    backend,                        \
+                                    layout,                         \
+                                    args_def_fn,                    \
+                                    meta_kernel_fn,                 \
+                                    cpp_dtype,                      \
+                                    ...)                            \
+  static const ::pten::KernelRegistrar PT_CONCATENATE(              \
+      __reg_pt_op_kernel_##func_id##_, registrar_id)(               \
+      kernel_name,                                                  \
+      BACKEND(backend),                                             \
+      DATALAYOUT(layout),                                           \
+      ::paddle::experimental::CppTypeToDataType<cpp_dtype>::Type(), \
+      ::pten::KernelArgsParseFunctor<decltype(                      \
+          &meta_kernel_fn<cpp_dtype>)>::Parse,                      \
+      args_def_fn,                                                  \
+      PT_KERNEL(meta_kernel_fn<cpp_dtype>));                        \
+  PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_1(kernel_name,                \
+                                        func_id,                    \
+                                        PT_ID,                      \
+                                        backend,                    \
+                                        layout,                     \
+                                        args_def_fn,                \
+                                        meta_kernel_fn,             \
+                                        __VA_ARGS__))
+#define _PT_KERNEL_REGISTRAR_INIT_3(kernel_name,                    \
+                                    func_id,                        \
+                                    registrar_id,                   \
+                                    backend,                        \
+                                    layout,                         \
+                                    args_def_fn,                    \
+                                    meta_kernel_fn,                 \
+                                    cpp_dtype,                      \
+                                    ...)                            \
+  static const ::pten::KernelRegistrar PT_CONCATENATE(              \
+      __reg_pt_op_kernel_##func_id##_, registrar_id)(               \
+      kernel_name,                                                  \
+      BACKEND(backend),                                             \
+      DATALAYOUT(layout),                                           \
+      ::paddle::experimental::CppTypeToDataType<cpp_dtype>::Type(), \
+      ::pten::KernelArgsParseFunctor<decltype(                      \
+          &meta_kernel_fn<cpp_dtype>)>::Parse,                      \
+      args_def_fn,                                                  \
+      PT_KERNEL(meta_kernel_fn<cpp_dtype>));                        \
+  PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_2(kernel_name,                \
+                                        func_id,                    \
+                                        PT_ID,                      \
+                                        backend,                    \
+                                        layout,                     \
+                                        args_def_fn,                \
+                                        meta_kernel_fn,             \
+                                        __VA_ARGS__))
+#define _PT_KERNEL_REGISTRAR_INIT_4(kernel_name,                    \
+                                    func_id,                        \
+                                    registrar_id,                   \
+                                    backend,                        \
+                                    layout,                         \
+                                    args_def_fn,                    \
+                                    meta_kernel_fn,                 \
+                                    cpp_dtype,                      \
+                                    ...)                            \
+  static const ::pten::KernelRegistrar PT_CONCATENATE(              \
+      __reg_pt_op_kernel_##func_id##_, registrar_id)(               \
+      kernel_name,                                                  \
+      BACKEND(backend),                                             \
+      DATALAYOUT(layout),                                           \
+      ::paddle::experimental::CppTypeToDataType<cpp_dtype>::Type(), \
+      ::pten::KernelArgsParseFunctor<decltype(                      \
+          &meta_kernel_fn<cpp_dtype>)>::Parse,                      \
+      args_def_fn,                                                  \
+      PT_KERNEL(meta_kernel_fn<cpp_dtype>));                        \
+  PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_3(kernel_name,                \
+                                        func_id,                    \
+                                        PT_ID,                      \
+                                        backend,                    \
+                                        layout,                     \
+                                        args_def_fn,                \
+                                        meta_kernel_fn,             \
+                                        __VA_ARGS__))
+#define _PT_KERNEL_REGISTRAR_INIT_5(kernel_name,                    \
+                                    func_id,                        \
+                                    registrar_id,                   \
+                                    backend,                        \
+                                    layout,                         \
+                                    args_def_fn,                    \
+                                    meta_kernel_fn,                 \
+                                    cpp_dtype,                      \
+                                    ...)                            \
+  static const ::pten::KernelRegistrar PT_CONCATENATE(              \
+      __reg_pt_op_kernel_##func_id##_, registrar_id)(               \
+      kernel_name,                                                  \
+      BACKEND(backend),                                             \
+      DATALAYOUT(layout),                                           \
+      ::paddle::experimental::CppTypeToDataType<cpp_dtype>::Type(), \
+      ::pten::KernelArgsParseFunctor<decltype(                      \
+          &meta_kernel_fn<cpp_dtype>)>::Parse,                      \
+      args_def_fn,                                                  \
+      PT_KERNEL(meta_kernel_fn<cpp_dtype>));                        \
+  PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_4(kernel_name,                \
+                                        func_id,                    \
+                                        PT_ID,                      \
+                                        backend,                    \
+                                        layout,                     \
+                                        args_def_fn,                \
+                                        meta_kernel_fn,             \
+                                        __VA_ARGS__))
+#define _PT_KERNEL_REGISTRAR_INIT_6(kernel_name,                    \
+                                    func_id,                        \
+                                    registrar_id,                   \
+                                    backend,                        \
+                                    layout,                         \
+                                    args_def_fn,                    \
+                                    meta_kernel_fn,                 \
+                                    cpp_dtype,                      \
+                                    ...)                            \
+  static const ::pten::KernelRegistrar PT_CONCATENATE(              \
+      __reg_pt_op_kernel_##func_id##_, registrar_id)(               \
+      kernel_name,                                                  \
+      BACKEND(backend),                                             \
+      DATALAYOUT(layout),                                           \
+      ::paddle::experimental::CppTypeToDataType<cpp_dtype>::Type(), \
+      ::pten::KernelArgsParseFunctor<decltype(                      \
+          &meta_kernel_fn<cpp_dtype>)>::Parse,                      \
+      args_def_fn,                                                  \
+      PT_KERNEL(meta_kernel_fn<cpp_dtype>));                        \
+  PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_5(kernel_name,                \
+                                        func_id,                    \
+                                        PT_ID,                      \
+                                        backend,                    \
+                                        layout,                     \
+                                        args_def_fn,                \
+                                        meta_kernel_fn,             \
+                                        __VA_ARGS__))
+#define _PT_KERNEL_REGISTRAR_INIT_7(kernel_name,                    \
+                                    func_id,                        \
+                                    registrar_id,                   \
+                                    backend,                        \
+                                    layout,                         \
+                                    args_def_fn,                    \
+                                    meta_kernel_fn,                 \
+                                    cpp_dtype,                      \
+                                    ...)                            \
+  static const ::pten::KernelRegistrar PT_CONCATENATE(              \
+      __reg_pt_op_kernel_##func_id##_, registrar_id)(               \
+      kernel_name,                                                  \
+      BACKEND(backend),                                             \
+      DATALAYOUT(layout),                                           \
+      ::paddle::experimental::CppTypeToDataType<cpp_dtype>::Type(), \
+      ::pten::KernelArgsParseFunctor<decltype(                      \
+          &meta_kernel_fn<cpp_dtype>)>::Parse,                      \
+      args_def_fn,                                                  \
+      PT_KERNEL(meta_kernel_fn<cpp_dtype>));                        \
+  PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_6(kernel_name,                \
+                                        func_id,                    \
+                                        PT_ID,                      \
+                                        backend,                    \
+                                        layout,                     \
+                                        args_def_fn,                \
+                                        meta_kernel_fn,             \
+                                        __VA_ARGS__))
+#define _PT_KERNEL_REGISTRAR_INIT_8(kernel_name,                    \
+                                    func_id,                        \
+                                    registrar_id,                   \
+                                    backend,                        \
+                                    layout,                         \
+                                    args_def_fn,                    \
+                                    meta_kernel_fn,                 \
+                                    cpp_dtype,                      \
+                                    ...)                            \
+  static const ::pten::KernelRegistrar PT_CONCATENATE(              \
+      __reg_pt_op_kernel_##func_id##_, registrar_id)(               \
+      kernel_name,                                                  \
+      BACKEND(backend),                                             \
+      DATALAYOUT(layout),                                           \
+      ::paddle::experimental::CppTypeToDataType<cpp_dtype>::Type(), \
+      ::pten::KernelArgsParseFunctor<decltype(                      \
+          &meta_kernel_fn<cpp_dtype>)>::Parse,                      \
+      args_def_fn,                                                  \
+      PT_KERNEL(meta_kernel_fn<cpp_dtype>));                        \
+  PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_7(kernel_name,                \
+                                        func_id,                    \
+                                        PT_ID,                      \
+                                        backend,                    \
+                                        layout,                     \
+                                        args_def_fn,                \
+                                        meta_kernel_fn,             \
+                                        __VA_ARGS__))
+#define _PT_KERNEL_REGISTRAR_INIT_9(kernel_name,                    \
+                                    func_id,                        \
+                                    registrar_id,                   \
+                                    backend,                        \
+                                    layout,                         \
+                                    args_def_fn,                    \
+                                    meta_kernel_fn,                 \
+                                    cpp_dtype,                      \
+                                    ...)                            \
+  static const ::pten::KernelRegistrar PT_CONCATENATE(              \
+      __reg_pt_op_kernel_##func_id##_, registrar_id)(               \
+      kernel_name,                                                  \
+      BACKEND(backend),                                             \
+      DATALAYOUT(layout),                                           \
+      ::paddle::experimental::CppTypeToDataType<cpp_dtype>::Type(), \
+      ::pten::KernelArgsParseFunctor<decltype(                      \
+          &meta_kernel_fn<cpp_dtype>)>::Parse,                      \
+      args_def_fn,                                                  \
+      PT_KERNEL(meta_kernel_fn<cpp_dtype>));                        \
+  PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_8(kernel_name,                \
+                                        func_id,                    \
+                                        PT_ID,                      \
+                                        backend,                    \
+                                        layout,                     \
+                                        args_def_fn,                \
+                                        meta_kernel_fn,             \
+                                        __VA_ARGS__))
+#define _PT_KERNEL_REGISTRAR_INIT_10(kernel_name,                   \
+                                     func_id,                       \
+                                     registrar_id,                  \
+                                     backend,                       \
+                                     layout,                        \
+                                     args_def_fn,                   \
+                                     meta_kernel_fn,                \
+                                     cpp_dtype,                     \
+                                     ...)                           \
+  static const ::pten::KernelRegistrar PT_CONCATENATE(              \
+      __reg_pt_op_kernel_##func_id##_, registrar_id)(               \
+      kernel_name,                                                  \
+      BACKEND(backend),                                             \
+      DATALAYOUT(layout),                                           \
+      ::paddle::experimental::CppTypeToDataType<cpp_dtype>::Type(), \
+      ::pten::KernelArgsParseFunctor<decltype(                      \
+          &meta_kernel_fn<cpp_dtype>)>::Parse,                      \
+      args_def_fn,                                                  \
+      PT_KERNEL(meta_kernel_fn<cpp_dtype>));                        \
+  PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_9(kernel_name,                \
+                                        func_id,                    \
+                                        PT_ID,                      \
+                                        backend,                    \
+                                        layout,                     \
+                                        args_def_fn,                \
+                                        meta_kernel_fn,             \
+                                        __VA_ARGS__))
+#define _PT_KERNEL_REGISTRAR_INIT_11(kernel_name,                   \
+                                     func_id,                       \
+                                     registrar_id,                  \
+                                     backend,                       \
+                                     layout,                        \
+                                     args_def_fn,                   \
+                                     meta_kernel_fn,                \
+                                     cpp_dtype,                     \
+                                     ...)                           \
+  static const ::pten::KernelRegistrar PT_CONCATENATE(              \
+      __reg_pt_op_kernel_##func_id##_, registrar_id)(               \
+      kernel_name,                                                  \
+      BACKEND(backend),                                             \
+      DATALAYOUT(layout),                                           \
+      ::paddle::experimental::CppTypeToDataType<cpp_dtype>::Type(), \
+      ::pten::KernelArgsParseFunctor<decltype(                      \
+          &meta_kernel_fn<cpp_dtype>)>::Parse,                      \
+      args_def_fn,                                                  \
+      PT_KERNEL(meta_kernel_fn<cpp_dtype>));                        \
+  PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_10(kernel_name,               \
+                                         func_id,                   \
+                                         PT_ID,                     \
+                                         backend,                   \
+                                         layout,                    \
+                                         args_def_fn,               \
+                                         meta_kernel_fn,            \
+                                         __VA_ARGS__))
+#define _PT_KERNEL_REGISTRAR_INIT_12(kernel_name,                   \
+                                     func_id,                       \
+                                     registrar_id,                  \
+                                     backend,                       \
+                                     layout,                        \
+                                     args_def_fn,                   \
+                                     meta_kernel_fn,                \
+                                     cpp_dtype,                     \
+                                     ...)                           \
+  static const ::pten::KernelRegistrar PT_CONCATENATE(              \
+      __reg_pt_op_kernel_##func_id##_, registrar_id)(               \
+      kernel_name,                                                  \
+      BACKEND(backend),                                             \
+      DATALAYOUT(layout),                                           \
+      ::paddle::experimental::CppTypeToDataType<cpp_dtype>::Type(), \
+      ::pten::KernelArgsParseFunctor<decltype(                      \
+          &meta_kernel_fn<cpp_dtype>)>::Parse,                      \
+      args_def_fn,                                                  \
+      PT_KERNEL(meta_kernel_fn<cpp_dtype>));                        \
+  PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_11(kernel_name,               \
+                                         func_id,                   \
+                                         PT_ID,                     \
+                                         backend,                   \
+                                         layout,                    \
+                                         args_def_fn,               \
+                                         meta_kernel_fn,            \
+                                         __VA_ARGS__))
+#define _PT_KERNEL_REGISTRAR_INIT_13(kernel_name,                   \
+                                     func_id,                       \
+                                     registrar_id,                  \
+                                     backend,                       \
+                                     layout,                        \
+                                     args_def_fn,                   \
+                                     meta_kernel_fn,                \
+                                     cpp_dtype,                     \
+                                     ...)                           \
+  static const ::pten::KernelRegistrar PT_CONCATENATE(              \
+      __reg_pt_op_kernel_##func_id##_, registrar_id)(               \
+      kernel_name,                                                  \
+      BACKEND(backend),                                             \
+      DATALAYOUT(layout),                                           \
+      ::paddle::experimental::CppTypeToDataType<cpp_dtype>::Type(), \
+      ::pten::KernelArgsParseFunctor<decltype(                      \
+          &meta_kernel_fn<cpp_dtype>)>::Parse,                      \
+      args_def_fn,                                                  \
+      PT_KERNEL(meta_kernel_fn<cpp_dtype>));                        \
+  PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_12(kernel_name,               \
+                                         func_id,                   \
+                                         PT_ID,                     \
+                                         backend,                   \
+                                         layout,                    \
+                                         args_def_fn,               \
+                                         meta_kernel_fn,            \
+                                         __VA_ARGS__))
+#define _PT_KERNEL_REGISTRAR_INIT_14(kernel_name,                   \
+                                     func_id,                       \
+                                     registrar_id,                  \
+                                     backend,                       \
+                                     layout,                        \
+                                     args_def_fn,                   \
+                                     meta_kernel_fn,                \
+                                     cpp_dtype,                     \
+                                     ...)                           \
+  static const ::pten::KernelRegistrar PT_CONCATENATE(              \
+      __reg_pt_op_kernel_##func_id##_, registrar_id)(               \
+      kernel_name,                                                  \
+      BACKEND(backend),                                             \
+      DATALAYOUT(layout),                                           \
+      ::paddle::experimental::CppTypeToDataType<cpp_dtype>::Type(), \
+      ::pten::KernelArgsParseFunctor<decltype(                      \
+          &meta_kernel_fn<cpp_dtype>)>::Parse,                      \
+      args_def_fn,                                                  \
+      PT_KERNEL(meta_kernel_fn<cpp_dtype>));                        \
+  PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_13(kernel_name,               \
+                                         func_id,                   \
+                                         PT_ID,                     \
+                                         backend,                   \
+                                         layout,                    \
+                                         args_def_fn,               \
+                                         meta_kernel_fn,            \
+                                         __VA_ARGS__))
+#define _PT_KERNEL_REGISTRAR_INIT_15(kernel_name,                   \
+                                     func_id,                       \
+                                     registrar_id,                  \
+                                     backend,                       \
+                                     layout,                        \
+                                     args_def_fn,                   \
+                                     meta_kernel_fn,                \
+                                     cpp_dtype,                     \
+                                     ...)                           \
+  static const ::pten::KernelRegistrar PT_CONCATENATE(              \
+      __reg_pt_op_kernel_##func_id##_, registrar_id)(               \
+      kernel_name,                                                  \
+      BACKEND(backend),                                             \
+      DATALAYOUT(layout),                                           \
+      ::paddle::experimental::CppTypeToDataType<cpp_dtype>::Type(), \
+      ::pten::KernelArgsParseFunctor<decltype(                      \
+          &meta_kernel_fn<cpp_dtype>)>::Parse,                      \
+      args_def_fn,                                                  \
+      PT_KERNEL(meta_kernel_fn<cpp_dtype>));                        \
+  PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_14(kernel_name,               \
+                                         func_id,                   \
+                                         PT_ID,                     \
+                                         backend,                   \
+                                         layout,                    \
+                                         args_def_fn,               \
+                                         meta_kernel_fn,            \
+                                         __VA_ARGS__))
+
+#define PT_REGISTER_KERNEL_STANDARD(                \
+    kernel_name, backend, layout, dtype, kernel_fn) \
+  _PT_REGISTER_KERNEL_STANDARD(                     \
+      kernel_name, PT_ID, backend, layout, dtype, kernel_fn)
+
+#define _PT_REGISTER_KERNEL_STANDARD(                                      \
+    kernel_name, func_id, backend, layout, dtype, kernel_fn)               \
+  PT_STATIC_ASSERT_GLOBAL_NAMESPACE(                                       \
+      PT_CONCATENATE(pt_op_kernel_ns_check_, func_id),                     \
+      "_PT_REGISTER_KERNEL_STANDARD must be called in global namespace."); \
+  template decltype(kernel_fn) kernel_fn;                                  \
+  static void PT_CONCATENATE(__PT_KERNEL_args_def_FN_,                     \
+                             func_id)(::pten::Kernel*);                    \
+  static const ::pten::KernelRegistrar PT_CONCATENATE(__reg_pt_op_kernel_, \
+                                                      func_id)(            \
+      kernel_name,                                                         \
+      BACKEND(backend),                                                    \
+      DATALAYOUT(layout),                                                  \
+      DATATYPE(dtype),                                                     \
+      ::pten::KernelArgsParseFunctor<decltype(&kernel_fn)>::Parse,         \
+      args_def_fn,                                                         \
+      PT_KERNEL(kernel_fn));                                               \
+  void PT_CONCATENATE(__PT_KERNEL_args_def_FN_, func_id)(::pten::Kernel*)
+
+// use to declare symbol
+#define PT_REGISTER_MODULE(name) \
+  int RegisterSymbolsFor##name() { return 0; }
+
+#define PT_DECLARE_MODULE(name)          \
+  extern int RegisterSymbolsFor##name(); \
+  UNUSED static int use_kernel_module_##name = RegisterSymbolsFor##name()
+
+// only used in cpp tests
+
+#define PT_REGISTER_KERNEL_FOR_TEST(                              \
+    kernel_name, backend, layout, meta_kernel_fn, cpp_dtype, ...) \
+  _PT_REGISTER_KERNEL_FOR_TEST(kernel_name,                       \
+                               PT_ID,                             \
+                               backend,                           \
+                               layout,                            \
+                               meta_kernel_fn,                    \
+                               cpp_dtype,                         \
+                               __VA_ARGS__)
+
+#define _PT_REGISTER_KERNEL_FOR_TEST(                                      \
+    kernel_name, func_id, backend, layout, meta_kernel_fn, cpp_dtype, ...) \
+  PT_STATIC_ASSERT_GLOBAL_NAMESPACE(                                       \
+      PT_CONCATENATE(pt_op_kernel_for_test_ns_check_, func_id),            \
+      "PT_REGISTER_KERNEL must be called in global namespace.");           \
+  static void PT_CONCATENATE(__PT_KERNEL_for_test_args_def_FN_,            \
+                             func_id)(::pten::Kernel*);                    \
+  PT_KERNEL_REGISTRAR_INIT(                                                \
+      kernel_name,                                                         \
+      func_id,                                                             \
+      backend,                                                             \
+      layout,                                                              \
+      &PT_CONCATENATE(__PT_KERNEL_for_test_args_def_FN_, func_id),         \
+      meta_kernel_fn,                                                      \
+      cpp_dtype,                                                           \
+      __VA_ARGS__);                                                        \
+  void PT_CONCATENATE(__PT_KERNEL_for_test_args_def_FN_,                   \
+                      func_id)(::pten::Kernel * kernel)
+
+#define PT_REGISTER_KERNEL_WITH_NO_TYPE(          \
+    kernel_name, backend, layout, meta_kernel_fn) \
+  _PT_REGISTER_KERNEL_WITH_NO_TYPE(               \
+      kernel_name, PT_ID, backend, layout, meta_kernel_fn)
+
+#define _PT_REGISTER_KERNEL_WITH_NO_TYPE(                               \
+    kernel_name, func_id, backend, layout, meta_kernel_fn)              \
+  PT_STATIC_ASSERT_GLOBAL_NAMESPACE(                                    \
+      PT_CONCATENATE(pt_op_kernel_ns_check_, func_id),                  \
+      "PT_REGISTER_KERNEL must be called in global namespace.");        \
+  decltype(meta_kernel_fn) meta_kernel_fn;                              \
+  static void PT_CONCATENATE(__PT_KERNEL_args_def_FN_,                  \
+                             func_id)(::pten::Kernel*);                 \
+  static const ::pten::KernelRegistrar __reg_pt_op_kernel_##func_id(    \
+      kernel_name,                                                      \
+      BACKEND(backend),                                                 \
+      DATALAYOUT(layout),                                               \
+      ::pten::KernelArgsParseFunctor<decltype(&meta_kernel_fn)>::Parse, \
+      &PT_CONCATENATE(__PT_KERNEL_args_def_FN_, func_id),               \
+      PT_KERNEL(meta_kernel_fn));                                       \
+  void PT_CONCATENATE(__PT_KERNEL_args_def_FN_,                         \
+                      func_id)(::pten::Kernel * kernel)
+}  // namespace pten
diff --git a/paddle/pten/core/kernel_utils.h b/paddle/pten/core/kernel_utils.h
new file mode 100644
index 0000000000000..23143c06244ca
--- /dev/null
+++ b/paddle/pten/core/kernel_utils.h
@@ -0,0 +1,233 @@
+//   Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/pten/common/scalar.h"
+#include "paddle/pten/core/dense_tensor.h"
+#include "paddle/pten/core/kernel_context.h"
+#include "paddle/pten/core/kernel_def.h"
+
+// See Note [ Why still include the fluid headers? ]
+#include "paddle/fluid/platform/device_context.h"
+#include "paddle/fluid/platform/enforce.h"
+
+namespace pten {
+
+// TODO(shixiaowei): replaced by new DeviceContext later
+using CPUContext = paddle::platform::CPUDeviceContext;
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+using CUDAContext = paddle::platform::CUDADeviceContext;
+#endif
+#ifdef PADDLE_WITH_MKLDNN
+using MKLDNNContext = paddle::platform::MKLDNNDeviceContext;
+#endif
+#ifdef PADDLE_WITH_ASCEND_CL
+using NPUContext = paddle::platform::NPUDeviceContext;
+#endif
+#ifdef PADDLE_WITH_XPU
+using XPUContext = paddle::platform::XPUDeviceContext;
+#endif
+
+#define PT_KERNEL(...) \
+  ::pten::KernelImpl<decltype(&__VA_ARGS__), &__VA_ARGS__>::Compute
+
+#define PT_SPECIALIZE_KernelCallHelper_FOR_DEVICE_CONTEXT(dev_ctx)           \
+  template <typename... Tail>                                                \
+  struct KernelCallHelper<const dev_ctx&, Tail...> {                         \
+    template <int dev_ctx_idx,                                               \
+              int in_idx,                                                    \
+              int attr_idx,                                                  \
+              int out_idx,                                                   \
+              typename... PreviousArgs>                                      \
+    static void Compute(KernelContext* ctx, PreviousArgs&... pargs) {        \
+      static_assert(in_idx == 0,                                             \
+                    "Kernel's DeviceContext should appear before Inputs.");  \
+      static_assert(                                                         \
+          attr_idx == 0,                                                     \
+          "Kernel's DeviceContext should appear before Attributes.");        \
+      static_assert(out_idx == 0,                                            \
+                    "Kernel's DeviceContext should appear before Outputs."); \
+      const dev_ctx& arg = ctx->GetDeviceContext<dev_ctx>();                 \
+      KernelCallHelper<Tail...>::                                            \
+          template Compute<dev_ctx_idx + 1, in_idx, attr_idx, out_idx>(      \
+              ctx, pargs..., arg);                                           \
+    }                                                                        \
+  }
+
+#define PT_SPECIALIZE_KernelCallHelper_FOR_INPUT(tensor_type)           \
+  template <typename... Tail>                                           \
+  struct KernelCallHelper<const tensor_type&, Tail...> {                \
+    template <int dev_ctx_idx,                                          \
+              int in_idx,                                               \
+              int attr_idx,                                             \
+              int out_idx,                                              \
+              typename... PreviousArgs>                                 \
+    static void Compute(KernelContext* ctx, PreviousArgs&... pargs) {   \
+      static_assert(attr_idx == 0,                                      \
+                    "Kernel's Input should appear before Attributes."); \
+      static_assert(out_idx == 0,                                       \
+                    "Kernel's Input should appear before Outputs.");    \
+      const std::pair<int, int> range = ctx->InputRangeAt(in_idx);      \
+      const tensor_type& arg = ctx->InputAt<tensor_type>(range.first);  \
+      KernelCallHelper<Tail...>::                                       \
+          template Compute<dev_ctx_idx, in_idx + 1, attr_idx, out_idx>( \
+              ctx, pargs..., arg);                                      \
+    }                                                                   \
+  }
+
+#define PT_SPECIALIZE_KernelCallHelper_FOR_MULTI_INPUT(tensor_type)     \
+  template <typename... Tail>                                           \
+  struct KernelCallHelper<const std::vector<tensor_type>&, Tail...> {   \
+    template <int dev_ctx_idx,                                          \
+              int in_idx,                                               \
+              int attr_idx,                                             \
+              int out_idx,                                              \
+              typename... PreviousArgs>                                 \
+    static void Compute(KernelContext* ctx, PreviousArgs&... pargs) {   \
+      static_assert(attr_idx == 0,                                      \
+                    "Kernel's Input should appear before Attributes."); \
+      static_assert(out_idx == 0,                                       \
+                    "Kernel's Input should appear before Outputs.");    \
+      const std::pair<int, int> range = ctx->InputRangeAt(in_idx);      \
+      std::vector<tensor_type> arg = std::move(                         \
+          ctx->InputBetween<tensor_type>(range.first, range.second));   \
+      KernelCallHelper<Tail...>::                                       \
+          template Compute<dev_ctx_idx, in_idx + 1, attr_idx, out_idx>( \
+              ctx, pargs..., arg);                                      \
+    }                                                                   \
+  }
+
+#define PT_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(attr_type)           \
+  template <typename... Tail>                                             \
+  struct KernelCallHelper<attr_type, Tail...> {                           \
+    template <int dev_ctx_idx,                                            \
+              int in_idx,                                                 \
+              int attr_idx,                                               \
+              int out_idx,                                                \
+              typename... PreviousArgs>                                   \
+    static void Compute(KernelContext* ctx, PreviousArgs&... pargs) {     \
+      static_assert(out_idx == 0,                                         \
+                    "Kernel's Attributes should appear before Outputs."); \
+      attr_type arg = ctx->AttrAt<attr_type>(attr_idx);                   \
+      KernelCallHelper<Tail...>::                                         \
+          template Compute<dev_ctx_idx, in_idx, attr_idx + 1, out_idx>(   \
+              ctx, pargs..., arg);                                        \
+    }                                                                     \
+  }
+
+#define PT_SPECIALIZE_KernelCallHelper_FOR_OUTPUT(tensor_type)           \
+  template <typename... Tail>                                            \
+  struct KernelCallHelper<tensor_type*, Tail...> {                       \
+    template <int dev_ctx_idx,                                           \
+              int in_idx,                                                \
+              int attr_idx,                                              \
+              int out_idx,                                               \
+              typename... PreviousArgs>                                  \
+    static void Compute(KernelContext* ctx, PreviousArgs&... pargs) {    \
+      const std::pair<int, int> range = ctx->OutputRangeAt(out_idx);     \
+      tensor_type* arg = ctx->MutableOutputAt<tensor_type>(range.first); \
+      KernelCallHelper<Tail...>::                                        \
+          template Compute<dev_ctx_idx, in_idx, attr_idx, out_idx + 1>(  \
+              ctx, pargs..., arg);                                       \
+    }                                                                    \
+  }
+
+#define PT_SPECIALIZE_KernelCallHelper_FOR_MULTI_OUTPUT(tensor_type)          \
+  template <typename... Tail>                                                 \
+  struct KernelCallHelper<std::vector<tensor_type*>, Tail...> {               \
+    template <int dev_ctx_idx,                                                \
+              int in_idx,                                                     \
+              int attr_idx,                                                   \
+              int out_idx,                                                    \
+              typename... PreviousArgs>                                       \
+    static void Compute(KernelContext* ctx, PreviousArgs&... pargs) {         \
+      const std::pair<int, int> range = ctx->OutputRangeAt(out_idx);          \
+      std::vector<tensor_type*> arg = std::move(                              \
+          ctx->MutableOutputBetween<tensor_type>(range.first, range.second)); \
+      KernelCallHelper<Tail...>::                                             \
+          template Compute<dev_ctx_idx, in_idx, attr_idx, out_idx + 1>(       \
+              ctx, pargs..., arg);                                            \
+    }                                                                         \
+  }
+
+template <typename T>
+struct TypeTag {};
+
+template <typename Fn, Fn fn>
+struct KernelImpl;
+
+template <typename Return, typename... Args, Return (*kernel_fn)(Args...)>
+struct KernelImpl<Return (*)(Args...), kernel_fn> {
+  static void Compute(KernelContext* ctx) {
+    KernelCallHelper<Args..., TypeTag<int>>::template Compute<0, 0, 0, 0>(ctx);
+  }
+
+ private:
+  template <typename... RemainingArgs>
+  struct KernelCallHelper;
+
+  /* DeviceContext Helpers */
+
+  PT_SPECIALIZE_KernelCallHelper_FOR_DEVICE_CONTEXT(CPUContext);
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+  PT_SPECIALIZE_KernelCallHelper_FOR_DEVICE_CONTEXT(CUDAContext);
+#endif
+#ifdef PADDLE_WITH_ASCEND_CL
+  PT_SPECIALIZE_KernelCallHelper_FOR_DEVICE_CONTEXT(NPUContext);
+#endif
+#ifdef PADDLE_WITH_XPU
+  PT_SPECIALIZE_KernelCallHelper_FOR_DEVICE_CONTEXT(XPUContext);
+#endif
+
+  /* Input Helpers */
+
+  PT_SPECIALIZE_KernelCallHelper_FOR_INPUT(DenseTensor);
+  PT_SPECIALIZE_KernelCallHelper_FOR_MULTI_INPUT(DenseTensor);
+  // TODO(chenweihang): adapt SelectedRows
+  // PT_SPECIALIZE_KernelCallHelper_FOR_INPUT(SelectedRowsTensor);
+
+  /* Attribute Helpers */
+
+  PT_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(bool);
+  PT_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(float);
+  PT_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(double);
+  PT_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(int);
+  PT_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(int64_t);
+  PT_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(paddle::platform::float16);
+  PT_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(const Scalar&);
+  PT_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(const std::vector<int64_t>&);
+
+  /* Output Helpers */
+
+  PT_SPECIALIZE_KernelCallHelper_FOR_OUTPUT(DenseTensor);
+  PT_SPECIALIZE_KernelCallHelper_FOR_MULTI_OUTPUT(DenseTensor);
+  // TODO(chenweihang): adapt SelectedRows
+  // PT_SPECIALIZE_KernelCallHelper_FOR_OUTPUT(SelectedRowsTensor);
+
+  /* End case */
+  template <typename T>
+  struct KernelCallHelper<TypeTag<T>> {
+    template <int dev_ctx_idx, int in_idx, int attr_idx, int out_idx>
+    static void Compute(KernelContext* ctx, Args&... args) {
+      static_assert(dev_ctx_idx > 0,
+                    "Kernel should pass DeviceContext as argument.");
+      static_assert(out_idx > 0, "Kernel should have output argument.");
+      // TODO(chenweihang): check dev_ctx, in, attr, out number
+      return kernel_fn(args...);
+    }
+  };
+};
+
+}  // namespace pten
diff --git a/paddle/pten/core/storage.cc b/paddle/pten/core/storage.cc
new file mode 100644
index 0000000000000..5cac122b7dee6
--- /dev/null
+++ b/paddle/pten/core/storage.cc
@@ -0,0 +1,25 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/pten/core/storage.h"
+
+namespace pten {
+
+void TensorStorage::Realloc(size_t size) {
+  data_.Clear();
+  data_ = Allocate(alloc_, size);
+  size_ = size;
+}
+
+}  // namespace pten
diff --git a/paddle/pten/core/storage.h b/paddle/pten/core/storage.h
new file mode 100644
index 0000000000000..430572e253d6e
--- /dev/null
+++ b/paddle/pten/core/storage.h
@@ -0,0 +1,82 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <cstddef>
+
+#include "boost/intrusive_ptr.hpp"
+#include "paddle/pten/core/utils/intrusive_ptr.h"
+#include "paddle/pten/core/utils/intrusive_ref_counter.h"
+#include "paddle/pten/core/utils/type_info.h"
+
+#include "paddle/fluid/platform/place.h"
+#include "paddle/pten/core/allocator.h"
+
+namespace pten {
+
+/// \brief The interface of contiguous storage used for the dense tensor.
+/// It should be used in conjunction with the intrusive pointer. We prohibit
+/// all default copy operations to ensure the integrity of the package.
+class Storage : public intrusive_ref_counter<Storage> {
+ public:
+  using Place = paddle::platform::Place;
+  Storage() = default;
+  Storage(const Storage&) = delete;
+
+  explicit Storage(Allocation&& data) : data_(std::move(data)) {}
+
+  virtual ~Storage() = default;
+
+  /// \brief Get the mutable data pointer of the storage.
+  /// This function is set to inline to improve performance.
+  /// \return The mutable data pointer of the storage.
+  void* data() const noexcept { return data_.operator->(); }
+
+  virtual size_t size() const = 0;
+  virtual const Place& place() const = 0;
+  virtual bool OwnsMemory() const = 0;
+  virtual void Realloc(size_t n) = 0;
+
+ protected:
+  Allocation data_;
+};
+
+class TensorStorage : public Storage {
+ public:
+  using Place = paddle::platform::Place;
+
+  explicit TensorStorage(const std::shared_ptr<Allocator>& a) : alloc_(a) {}
+  TensorStorage(const std::shared_ptr<Allocator>& a, size_t size)
+      : Storage(Allocate(a, size)), alloc_(a), size_(size) {}
+
+  ~TensorStorage() = default;
+
+  static const char* name() { return "TensorStorage"; }
+
+  void Realloc(size_t size) override;
+
+  size_t size() const noexcept override { return size_; }
+  const Place& place() const override { return data_.place(); }
+  bool OwnsMemory() const noexcept override { return true; }
+  const std::shared_ptr<Allocator>& allocator() const noexcept {
+    return alloc_;
+  }
+
+ private:
+  const std::shared_ptr<Allocator> alloc_;
+  int64_t size_{0};
+};
+
+}  // namespace pten
diff --git a/paddle/pten/core/tensor_base.cc b/paddle/pten/core/tensor_base.cc
new file mode 100644
index 0000000000000..f9169674a4bbe
--- /dev/null
+++ b/paddle/pten/core/tensor_base.cc
@@ -0,0 +1,18 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/pten/core/tensor_base.h"
+#include "paddle/pten/core/utils/type_registry.h"
+
+namespace pten {}
diff --git a/paddle/pten/core/tensor_base.h b/paddle/pten/core/tensor_base.h
new file mode 100644
index 0000000000000..79fd742aea10b
--- /dev/null
+++ b/paddle/pten/core/tensor_base.h
@@ -0,0 +1,75 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/fluid/framework/ddim.h"
+#include "paddle/fluid/platform/place.h"
+#include "paddle/pten/common/backend.h"
+#include "paddle/pten/common/data_type.h"
+#include "paddle/pten/common/layout.h"
+#include "paddle/pten/core/storage.h"
+#include "paddle/pten/core/utils/type_registry.h"
+
+namespace pten {
+
+class TensorBase {
+ public:
+  using DataType = paddle::experimental::DataType;
+  using DataLayout = paddle::experimental::DataLayout;
+  using DDim = paddle::framework::DDim;
+  using Place = paddle::platform::Place;
+
+  virtual ~TensorBase() = default;
+
+  /// \brief Returns the number of elements contained in tensor.
+  /// \return The number of elements contained in tensor.
+  virtual int64_t numel() const = 0;
+
+  /// \brief Returns the dims of the tensor.
+  /// \return The dims of the tensor.
+  virtual const DDim& dims() const = 0;
+
+  /// \brief Returns the data type of the tensor.
+  /// \return The data type of the tensor.
+  virtual DataType data_type() const = 0;
+
+  /// \brief Returns the data layout of the tensor.
+  /// \return The data layout of the tensor.
+  virtual DataLayout layout() const = 0;
+
+  /// \brief Returns the data place of the tensor.
+  /// \return The data place of the tensor.
+  virtual const Place& place() const = 0;
+
+  /// \brief Test whether the metadata is valid.
+  /// \return Whether the metadata is valid.
+  virtual bool valid() const = 0;
+
+  /// \brief Test whether the storage is allocated.
+  /// return Whether the storage is allocated.
+  virtual bool initialized() const = 0;
+
+  /// \brief Return the type information of the derived class to support
+  /// safely downcast in non-rtti environment.
+  /// return The type information of the derived class.
+  TypeInfo<TensorBase> type_info() const { return type_info_; }
+
+ private:
+  template <typename T, typename U>
+  friend class TypeInfoTraits;
+  TypeInfo<TensorBase> type_info_{TypeInfo<TensorBase>::kUnknownType};
+};
+
+}  // namespace pten
diff --git a/paddle/pten/core/tensor_meta.h b/paddle/pten/core/tensor_meta.h
new file mode 100644
index 0000000000000..85afc3f2f01ea
--- /dev/null
+++ b/paddle/pten/core/tensor_meta.h
@@ -0,0 +1,85 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <vector>
+
+#include "paddle/pten/common/backend.h"
+#include "paddle/pten/common/data_type.h"
+#include "paddle/pten/common/layout.h"
+
+// See Note [ Why still include the fluid headers? ]
+#include "paddle/fluid/framework/ddim.h"
+// Note: mixed_vector include many header now, LoD will be
+// used on CUDA device? Can we use small_vector here?
+// #include "paddle/fluid/framework/mixed_vector.h"
+
+namespace pten {
+
+using DDim = paddle::framework::DDim;
+using LoD = std::vector<std::vector<size_t>>;
+
+/// \brief The meta data of dense tensor. Take the structure type
+/// and use all default operations.
+///
+struct DenseTensorMeta {
+  using DataType = paddle::experimental::DataType;
+  using DataLayout = paddle::experimental::DataLayout;
+
+  DenseTensorMeta() = default;
+  DenseTensorMeta(DataType type, const DDim& dims);
+  DenseTensorMeta(DataType type, const DDim& dims, DataLayout layout);
+  DenseTensorMeta(DataType type,
+                  const DDim& dims,
+                  DataLayout layout,
+                  const std::vector<std::vector<size_t>>& lod);
+
+  /// \brief Test whether the metadata is valid. Does not throw exceptions.
+  /// \return Whether the metadata is valid.
+  bool valid() const noexcept;
+
+  /// During the entire life cycle of a DenseTensor, the following attributes
+  /// marked with `const` are expected to remain unchanged.
+  const bool is_scalar{false};
+  DDim dims;
+  const DataType type{DataType::UNDEFINED};
+  const DataLayout layout{DataLayout::NCHW};
+  LoD lod;
+};
+
+inline DenseTensorMeta::DenseTensorMeta(DataType type, const DDim& dims)
+    : dims(dims), type(type) {}
+
+inline DenseTensorMeta::DenseTensorMeta(DataType type,
+                                        const DDim& dims,
+                                        DataLayout layout)
+    : dims(dims), type(type), layout(layout) {}
+
+inline DenseTensorMeta::DenseTensorMeta(
+    DataType type,
+    const DDim& dims,
+    DataLayout layout,
+    const std::vector<std::vector<size_t>>& lod)
+    : dims(dims), type(type), layout(layout), lod(lod) {}
+
+inline bool DenseTensorMeta::valid() const noexcept {
+  bool valid{true};
+  valid = valid && (type != DataType::UNDEFINED);
+  valid = valid && (layout != DataLayout::UNDEFINED);
+  valid = valid && (is_scalar || product(dims) >= 0);
+  return valid;
+}
+
+}  // namespace pten
diff --git a/paddle/pten/core/tensor_status.h b/paddle/pten/core/tensor_status.h
new file mode 100644
index 0000000000000..e426a27eabb88
--- /dev/null
+++ b/paddle/pten/core/tensor_status.h
@@ -0,0 +1,62 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/pten/common/backend.h"
+#include "paddle/pten/common/data_type.h"
+#include "paddle/pten/common/layout.h"
+namespace pten {
+class TensorInplaceVersion {
+ public:
+  explicit TensorInplaceVersion(uint32_t inplace_version = 0)
+      : inplace_version_(inplace_version) {}
+  bool IsUnique() const { return inplace_version_ == 0; }
+  void Bump() { ++inplace_version_; }
+  uint32_t CurrentVersion() const { return inplace_version_; }
+
+ private:
+  uint32_t inplace_version_;
+};
+
+/**
+ * The Status data member of DenseTensor.
+ *
+ * Here the `static` represents information describing the status of Tensor,
+ * such as version counter, or other bool status members.
+ *
+ * Note: TensorStatus is a struct, the members are named like
+ * ordinary nonmember variables, such as `type` instead of `type_`.
+ * And we direct access its members, in addition to constructor, destructor
+ * and functions for setting data members, can not provide other functions.
+ *
+ * Note: polish impl later
+ */
+struct TensorStatus {
+  TensorStatus() = default;
+  TensorStatus(const TensorStatus&) = default;
+  TensorStatus(TensorStatus&&) = default;
+
+  TensorStatus& operator=(const TensorStatus&) = delete;
+  TensorStatus& operator=(TensorStatus&&) = delete;
+
+  TensorInplaceVersion inplace_version_counter{0};
+
+  /**
+   * For Scalar Tensor design
+   */
+  bool is_scalar{false};
+};
+
+}  // namespace pten
diff --git a/paddle/pten/core/utils/intrusive_ptr.h b/paddle/pten/core/utils/intrusive_ptr.h
new file mode 100644
index 0000000000000..f0e94fadac973
--- /dev/null
+++ b/paddle/pten/core/utils/intrusive_ptr.h
@@ -0,0 +1,158 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <utility>
+#include "glog/logging.h"
+#include "paddle/fluid/platform/enforce.h"
+
+namespace pten {
+
+template <typename T>
+class intrusive_ptr {
+ public:
+  using this_type = intrusive_ptr;
+  constexpr intrusive_ptr() noexcept = default;
+
+  ~intrusive_ptr() {
+    if (px) {
+      intrusive_ptr_release(px);
+    }
+  }
+
+  intrusive_ptr(intrusive_ptr&& rhs) noexcept : px(rhs.px) { rhs.px = nullptr; }
+
+  template <typename U,
+            typename = std::enable_if_t<std::is_base_of<T, U>::value>>
+  intrusive_ptr(intrusive_ptr<U>&& rhs) noexcept : px(rhs.get()) {
+    rhs.reset();
+  }
+
+  void reset() { this_type().swap(*this); }
+
+  void reset(T* rhs) { this_type(rhs).swap(*this); }
+
+  void reset(T* rhs, bool add_ref) { this_type(rhs, add_ref).swap(*this); }
+
+  T* get() const noexcept { return px; }
+
+  T* detach() noexcept {
+    T* ret = px;
+    px = nullptr;
+    return ret;
+  }
+
+  T& operator*() const {
+    PADDLE_ENFORCE_NOT_NULL(
+        px,
+        paddle::platform::errors::PreconditionNotMet(
+            "The pointer must be non-null before the dereference operation."));
+    return *px;
+  }
+
+  T* operator->() const {
+    PADDLE_ENFORCE_NOT_NULL(
+        px,
+        paddle::platform::errors::PreconditionNotMet(
+            "The pointer must be non-null before the dereference operation."));
+    return px;
+  }
+
+  void swap(intrusive_ptr& rhs) noexcept {
+    T* tmp = px;
+    px = rhs.px;
+    rhs.px = tmp;
+  }
+
+ private:
+  template <typename U,
+            typename = std::enable_if_t<std::is_base_of<T, U>::value>>
+  explicit intrusive_ptr(U* p, bool add_ref = true) : px(p) {
+    if (px && add_ref) {
+      intrusive_ptr_add_ref(px);
+    }
+  }
+
+  template <typename R, typename... Args>
+  friend intrusive_ptr<R> make_intrusive(Args&&...);
+  template <typename R>
+  friend intrusive_ptr<R> copy_intrusive(const intrusive_ptr<R>&);
+
+  T* px{nullptr};
+};
+
+template <typename T, typename U>
+inline bool operator==(const intrusive_ptr<T>& a,
+                       const intrusive_ptr<U>& b) noexcept {
+  return a.get() == b.get();
+}
+
+template <typename T, typename U>
+inline bool operator!=(const intrusive_ptr<T>& a,
+                       const intrusive_ptr<U>& b) noexcept {
+  return a.get() != b.get();
+}
+
+template <typename T, typename U>
+inline bool operator==(const intrusive_ptr<T>& a, U* b) noexcept {
+  return a.get() == b;
+}
+
+template <typename T, typename U>
+inline bool operator!=(const intrusive_ptr<T>& a, U* b) noexcept {
+  return a.get() != b;
+}
+
+template <typename T, typename U>
+inline bool operator==(T* a, const intrusive_ptr<U>& b) noexcept {
+  return a == b.get();
+}
+
+template <typename T, typename U>
+inline bool operator!=(T* a, const intrusive_ptr<U>& b) noexcept {
+  return a != b.get();
+}
+
+template <typename T>
+inline bool operator==(const intrusive_ptr<T>& p, std::nullptr_t) noexcept {
+  return p.get() == nullptr;
+}
+
+template <typename T>
+inline bool operator==(std::nullptr_t, const intrusive_ptr<T>& p) noexcept {
+  return p.get() == nullptr;
+}
+
+template <typename T>
+inline bool operator!=(const intrusive_ptr<T>& p, std::nullptr_t) noexcept {
+  return p.get() != nullptr;
+}
+
+template <typename T>
+inline bool operator!=(std::nullptr_t, const intrusive_ptr<T>& p) noexcept {
+  return p.get() != nullptr;
+}
+
+template <typename T, typename... Args>
+inline intrusive_ptr<T> make_intrusive(Args&&... args) {
+  return intrusive_ptr<T>(new T(std::forward<Args>(args)...), false);
+}
+
+template <typename T>
+inline intrusive_ptr<T> copy_intrusive(const intrusive_ptr<T>& rhs) {
+  return intrusive_ptr<T>(rhs.get(), true);
+}
+
+}  // namespace pten
diff --git a/paddle/pten/core/utils/intrusive_ref_counter.h b/paddle/pten/core/utils/intrusive_ref_counter.h
new file mode 100644
index 0000000000000..8e18c82197eb6
--- /dev/null
+++ b/paddle/pten/core/utils/intrusive_ref_counter.h
@@ -0,0 +1,64 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <atomic>
+
+namespace pten {
+
+template <typename DerivedT>
+class intrusive_ref_counter;
+template <typename DerivedT>
+void intrusive_ptr_add_ref(const intrusive_ref_counter<DerivedT>* p) noexcept;
+template <typename DerivedT>
+void intrusive_ptr_release(const intrusive_ref_counter<DerivedT>* p) noexcept;
+
+template <typename DerivedT>
+class intrusive_ref_counter {
+ public:
+  constexpr intrusive_ref_counter() noexcept : ref_(1) {}
+  virtual ~intrusive_ref_counter() = default;
+
+  unsigned int use_count() const noexcept { return ref_.load(); }
+
+ protected:
+  intrusive_ref_counter(const intrusive_ref_counter&) = delete;
+  intrusive_ref_counter& operator=(const intrusive_ref_counter&) = delete;
+
+  friend void intrusive_ptr_add_ref<DerivedT>(
+      const intrusive_ref_counter<DerivedT>* p) noexcept;
+  friend void intrusive_ptr_release<DerivedT>(
+      const intrusive_ref_counter<DerivedT>* p) noexcept;
+
+ private:
+  mutable std::atomic_int_fast32_t ref_;
+};
+
+template <typename DerivedT>
+inline void intrusive_ptr_add_ref(
+    const intrusive_ref_counter<DerivedT>* p) noexcept {
+  p->ref_.fetch_add(1, std::memory_order_relaxed);
+}
+
+template <typename DerivedT>
+inline void intrusive_ptr_release(
+    const intrusive_ref_counter<DerivedT>* p) noexcept {
+  if (p->ref_.load(std::memory_order_acquire) == 0 ||
+      p->ref_.fetch_sub(1) == 0) {
+    delete static_cast<const DerivedT*>(p);
+  }
+}
+
+}  // namespace pten
diff --git a/paddle/pten/core/utils/type_info.h b/paddle/pten/core/utils/type_info.h
new file mode 100644
index 0000000000000..4e4084a4c785b
--- /dev/null
+++ b/paddle/pten/core/utils/type_info.h
@@ -0,0 +1,59 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <string>
+
+namespace pten {
+
+template <typename BaseT>
+class TypeRegistry;
+
+template <typename BaseT>
+class TypeInfo {
+ public:
+  const std::string& name() const;
+
+  int8_t id() const { return id_; }
+
+  bool operator==(TypeInfo other) const { return id_ == other.id(); }
+  bool operator!=(TypeInfo other) const { return id_ != other.id(); }
+
+  static const TypeInfo kUnknownType;
+
+ private:
+  friend class TypeRegistry<BaseT>;
+  explicit TypeInfo(int8_t id) : id_(id) {}
+  int8_t id_;
+};
+
+template <typename BaseT, typename DerivedT>
+class TypeInfoTraits {
+ public:
+  static const TypeInfo<BaseT> kType;
+  TypeInfoTraits() {
+    static_cast<BaseT*>(static_cast<DerivedT*>(this))->type_info_ = kType;
+  }
+  static bool classof(const BaseT* obj) { return obj->type_info() == kType; }
+};
+
+template <typename BaseT>
+TypeInfo<BaseT> RegisterStaticType(const std::string& type);
+
+template <typename BaseT, typename DerivedT>
+const TypeInfo<BaseT> TypeInfoTraits<BaseT, DerivedT>::kType =
+    RegisterStaticType<BaseT>(DerivedT::name());
+
+}  // namespace pten
diff --git a/paddle/pten/core/utils/type_registry.h b/paddle/pten/core/utils/type_registry.h
new file mode 100644
index 0000000000000..82eb9ae52bd7e
--- /dev/null
+++ b/paddle/pten/core/utils/type_registry.h
@@ -0,0 +1,84 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <cassert>
+#include <mutex>
+#include <string>
+
+#include "paddle/pten/core/utils/type_info.h"
+
+namespace pten {
+
+template <typename BaseT>
+class TypeRegistry {
+ public:
+  TypeRegistry(const TypeRegistry&) = delete;
+  TypeRegistry& operator=(const TypeRegistry&) = delete;
+
+  static TypeRegistry& GetInstance();
+
+  TypeInfo<BaseT> RegisterType(const std::string& type);
+  const std::string& GetTypeName(TypeInfo<BaseT> info) const;
+
+ private:
+  TypeRegistry() = default;
+  mutable std::mutex mutex_;
+  std::vector<std::string> names_;
+  std::map<std::string, int8_t> name_to_id_;
+};
+
+template <typename BaseT>
+TypeRegistry<BaseT>& TypeRegistry<BaseT>::GetInstance() {
+  static TypeRegistry<BaseT> registry;
+  return registry;
+}
+
+template <typename BaseT>
+TypeInfo<BaseT> TypeRegistry<BaseT>::RegisterType(const std::string& type) {
+  std::lock_guard<std::mutex> guard(mutex_);
+  assert(name_to_id_.find(type) == name_to_id_.end());
+  assert(names_.size() < std::numeric_limits<int8_t>::max());
+  int8_t id = names_.size();
+  names_.emplace_back(type);
+  name_to_id_[type] = id;
+  return TypeInfo<BaseT>(id);
+}
+
+template <typename BaseT>
+const std::string& TypeRegistry<BaseT>::GetTypeName(
+    TypeInfo<BaseT> info) const {
+  std::lock_guard<std::mutex> guard(mutex_);
+  int8_t id = info.id();
+  assert(id >= 0);
+  assert(static_cast<size_t>(id) < names_.size());
+  return names_[id];
+}
+
+template <typename BaseT>
+TypeInfo<BaseT> RegisterStaticType(const std::string& type) {
+  return TypeRegistry<BaseT>::GetInstance().RegisterType(type);
+}
+
+template <typename BaseT>
+const std::string& TypeInfo<BaseT>::name() const {
+  return TypeRegistry<BaseT>::GetInstance().GetTypeName(*this);
+}
+
+template <typename BaseT>
+const TypeInfo<BaseT> TypeInfo<BaseT>::kUnknownType =
+    RegisterStaticType<BaseT>("Unknown");
+
+}  // namespace pten
diff --git a/paddle/pten/include/core.h b/paddle/pten/include/core.h
new file mode 100644
index 0000000000000..9a042753d1f73
--- /dev/null
+++ b/paddle/pten/include/core.h
@@ -0,0 +1,22 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+// See Note: [ How do we organize the kernel directory ]
+#include "paddle/pten/core/convert_utils.h"
+#include "paddle/pten/core/dense_tensor.h"
+#include "paddle/pten/core/kernel_context.h"
+#include "paddle/pten/core/kernel_factory.h"
+#include "paddle/pten/core/tensor_meta.h"
diff --git a/paddle/pten/include/creation.h b/paddle/pten/include/creation.h
new file mode 100644
index 0000000000000..0fb3f905e03fb
--- /dev/null
+++ b/paddle/pten/include/creation.h
@@ -0,0 +1,39 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/pten/api/lib/utils/allocator.h"
+#include "paddle/pten/include/infershape.h"
+#include "paddle/pten/kernels/cpu/creation.h"
+#include "paddle/pten/kernels/cuda/creation.h"
+
+namespace pten {
+
+// TODO(YuanRisheng) This function name should be same as User API name.
+// TODO(zyfncg) Automatic code generation
+template <typename T, typename ContextT>
+DenseTensor FillAnyLike(const ContextT& dev_ctx,
+                        const DenseTensor& x,
+                        const Scalar& val) {
+  auto out_meta = UnchangedInferShape(x.meta());
+  const auto allocator =
+      std::make_shared<paddle::experimental::DefaultAllocator>(
+          dev_ctx.GetPlace());
+  pten::DenseTensor dense_out(allocator, out_meta);
+  FillAnyLike<T>(dev_ctx, x, val, &dense_out);
+  return dense_out;
+}
+
+}  // namespace pten
diff --git a/paddle/pten/include/infershape.h b/paddle/pten/include/infershape.h
new file mode 100644
index 0000000000000..d8dd2837a72d9
--- /dev/null
+++ b/paddle/pten/include/infershape.h
@@ -0,0 +1,20 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+// See Note: [ How do we organize the kernel directory ]
+#include "paddle/pten/infershape/binary.h"
+#include "paddle/pten/infershape/nary.h"
+#include "paddle/pten/infershape/unary.h"
diff --git a/paddle/pten/include/linalg.h b/paddle/pten/include/linalg.h
new file mode 100644
index 0000000000000..70eebac5b6841
--- /dev/null
+++ b/paddle/pten/include/linalg.h
@@ -0,0 +1,38 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+// See Note: [ How do we organize the kernel directory ]
+#include "paddle/pten/api/lib/utils/allocator.h"
+#include "paddle/pten/include/infershape.h"
+#include "paddle/pten/kernels/cpu/linalg.h"
+#include "paddle/pten/kernels/cuda/linalg.h"
+
+namespace pten {
+
+template <typename T, typename ContextT>
+DenseTensor Dot(const ContextT& dev_ctx,
+                const DenseTensor& x,
+                const DenseTensor& y) {
+  auto out_meta = DotInferShape(x.meta(), y.meta());
+  const auto allocator =
+      std::make_shared<paddle::experimental::DefaultAllocator>(
+          dev_ctx.GetPlace());
+  pten::DenseTensor dense_out(allocator, out_meta);
+  Dot<T>(dev_ctx, x, y, &dense_out);
+  return dense_out;
+}
+
+}  // namespace pten
diff --git a/paddle/pten/include/manipulation.h b/paddle/pten/include/manipulation.h
new file mode 100644
index 0000000000000..e10f296dbd0f9
--- /dev/null
+++ b/paddle/pten/include/manipulation.h
@@ -0,0 +1,40 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+// See Note: [ How do we organize the kernel directory ]
+#include "paddle/pten/api/lib/utils/allocator.h"
+#include "paddle/pten/include/infershape.h"
+#include "paddle/pten/kernels/cpu/manipulation.h"
+#include "paddle/pten/kernels/cuda/manipulation.h"
+#include "paddle/pten/kernels/xpu/manipulation.h"
+
+namespace pten {
+
+template <typename T, typename ContextT>
+DenseTensor Flatten(const ContextT& dev_ctx,
+                    const DenseTensor& x,
+                    int start_axis,
+                    int stop_axis) {
+  auto out_meta = FlattenInferShape(x.meta(), start_axis, stop_axis);
+  const auto allocator =
+      std::make_shared<paddle::experimental::DefaultAllocator>(
+          dev_ctx.GetPlace());
+  pten::DenseTensor dense_out(allocator, out_meta);
+  Flatten<T>(dev_ctx, x, start_axis, stop_axis, &dense_out);
+  return dense_out;
+}
+
+}  // namespace pten
diff --git a/paddle/pten/include/math.h b/paddle/pten/include/math.h
new file mode 100644
index 0000000000000..0a61a94aa8d69
--- /dev/null
+++ b/paddle/pten/include/math.h
@@ -0,0 +1,76 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+// See Note: [ How do we organize the kernel directory ]
+#include "paddle/pten/api/lib/utils/allocator.h"
+#include "paddle/pten/include/infershape.h"
+#include "paddle/pten/kernels/cpu/math.h"
+#include "paddle/pten/kernels/cuda/math.h"
+
+namespace pten {
+
+template <typename T, typename ContextT>
+DenseTensor Sign(const ContextT& dev_ctx, const DenseTensor& x) {
+  auto out_meta = UnchangedInferShape(x.meta());
+  const auto allocator =
+      std::make_shared<paddle::experimental::DefaultAllocator>(
+          dev_ctx.GetPlace());
+  pten::DenseTensor dense_out(allocator, out_meta);
+  Sign<T>(dev_ctx, x, &dense_out);
+  return dense_out;
+}
+
+template <typename T, typename ContextT>
+DenseTensor Mean(const ContextT& dev_ctx, const DenseTensor& x) {
+  auto out_meta = ReductionInferShape(x.meta());
+  const auto allocator =
+      std::make_shared<paddle::experimental::DefaultAllocator>(
+          dev_ctx.GetPlace());
+  pten::DenseTensor dense_out(allocator, out_meta);
+  Mean<T>(dev_ctx, x, &dense_out);
+  return dense_out;
+}
+
+template <typename T, typename ContextT>
+DenseTensor Scale(const ContextT& dev_ctx,
+                  const DenseTensor& x,
+                  float scale,
+                  float bias,
+                  bool bias_after_scale) {
+  auto out_meta = UnchangedInferShape(x.meta());
+  const auto allocator =
+      std::make_shared<paddle::experimental::DefaultAllocator>(
+          dev_ctx.GetPlace());
+  pten::DenseTensor dense_out(allocator, out_meta);
+  Scale<T>(dev_ctx, x, scale, bias, bias_after_scale, &dense_out);
+  return dense_out;
+}
+
+template <typename T, typename ContextT>
+DenseTensor Scale(const ContextT& dev_ctx,
+                  const DenseTensor& x,
+                  const DenseTensor& scale,
+                  float bias,
+                  bool bias_after_scale) {
+  auto out_meta = UnchangedInferShape(x.meta());
+  const auto allocator =
+      std::make_shared<paddle::experimental::DefaultAllocator>(
+          dev_ctx.GetPlace());
+  pten::DenseTensor dense_out(allocator, out_meta);
+  ScaleHost<T>(dev_ctx, x, scale, bias, bias_after_scale, &dense_out);
+  return dense_out;
+}
+}  // namespace pten
diff --git a/paddle/pten/infershape/CMakeLists.txt b/paddle/pten/infershape/CMakeLists.txt
new file mode 100644
index 0000000000000..b32ec0a51c736
--- /dev/null
+++ b/paddle/pten/infershape/CMakeLists.txt
@@ -0,0 +1,3 @@
+cc_library(nary SRCS nary.cc DEPS convert_utils)
+cc_library(unary SRCS unary.cc DEPS convert_utils)
+cc_library(binary SRCS binary.cc DEPS convert_utils)
diff --git a/paddle/pten/infershape/binary.cc b/paddle/pten/infershape/binary.cc
new file mode 100644
index 0000000000000..c17e087158183
--- /dev/null
+++ b/paddle/pten/infershape/binary.cc
@@ -0,0 +1,132 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+// See Note [ Why still include the fluid headers? ]
+#include "paddle/pten/infershape/binary.h"
+
+namespace pten {
+
+DenseTensorMeta DotInferShape(const DenseTensorMeta& x_meta,
+                              const DenseTensorMeta& y_meta) {
+  auto x_dims = x_meta.dims;
+  auto x_rank = static_cast<size_t>(x_dims.size());
+  PADDLE_ENFORCE_EQ(true,
+                    1 == x_rank || 2 == x_rank,
+                    paddle::platform::errors::PreconditionNotMet(
+                        "ShapeError: The dimensions of input tensor X (%s) "
+                        "should be 1 or 2",
+                        x_dims.to_str()));
+
+  auto y_dims = y_meta.dims;
+  PADDLE_ENFORCE_EQ(
+      true,
+      x_rank == (size_t)y_dims.size(),
+      paddle::platform::errors::PreconditionNotMet(
+          "ShapeError: The shape of input tensor Y: %s should match with "
+          "input tenosr X: %s",
+          y_dims.to_str(),
+          x_dims.to_str()));
+  bool shape_match = true;
+  for (size_t i = 0; i < x_rank; ++i) {
+    if (x_dims[i] != y_dims[i]) {
+      shape_match = false;
+      break;
+    }
+  }
+
+  PADDLE_ENFORCE_EQ(true,
+                    shape_match,
+                    paddle::platform::errors::PreconditionNotMet(
+                        "ShapeError: The shape of input tensor X: %s should "
+                        "be exactly the same "
+                        "with input tensor Y: %s",
+                        x_dims.to_str(),
+                        y_dims.to_str()));
+
+  x_dims[x_dims.size() - 1] = 1;
+  DenseTensorMeta return_meta(x_meta.type, x_dims, x_meta.layout);
+  return return_meta;
+}
+
+DenseTensorMeta MatmulInferShape(const DenseTensorMeta& x_meta,
+                                 const DenseTensorMeta& y_meta,
+                                 bool trans_x,
+                                 bool trans_y) {
+  std::vector<int64_t> dims_x = paddle::framework::vectorize(x_meta.dims);
+  std::vector<int64_t> dims_y = paddle::framework::vectorize(y_meta.dims);
+  auto ndims_x = dims_x.size();
+  auto ndims_y = dims_y.size();
+  PADDLE_ENFORCE_GT(ndims_x,
+                    0,
+                    paddle::platform::errors::InvalidArgument(
+                        "The Input(x) dims size must be greater than 0,"
+                        " but reviced dims size is 0. "));
+  PADDLE_ENFORCE_GT(ndims_y,
+                    0,
+                    paddle::platform::errors::InvalidArgument(
+                        "The Input(y) dims size must be greater than 0,"
+                        " but reviced dims size is 0. "));
+
+  bool x_broadcasted = false, y_broadcasted = false;
+  if (ndims_x == 1) {
+    dims_x.insert(dims_x.begin(), 1);
+    ndims_x = 2;
+    x_broadcasted = true;
+  }
+
+  if (ndims_y == 1) {
+    dims_y.push_back(1);
+    ndims_y = 2;
+    y_broadcasted = true;
+  }
+
+  size_t M, N;
+  if (trans_x) {
+    M = dims_x[ndims_x - 1];
+  } else {
+    M = dims_x[ndims_x - 2];
+  }
+  if (trans_y) {
+    N = dims_y[ndims_y - 2];
+  } else {
+    N = dims_y[ndims_y - 1];
+  }
+
+  std::vector<int64_t> new_dims;
+  if (ndims_x > ndims_y) {
+    new_dims.assign(dims_x.begin(), dims_x.end() - 2);
+  } else if (ndims_x < ndims_y) {
+    new_dims.assign(dims_y.begin(), dims_y.end() - 2);
+  } else {
+    new_dims.reserve(ndims_x);
+    for (size_t i = 0; i < ndims_x - 2; ++i) {
+      new_dims.push_back(std::max(dims_x[i], dims_y[i]));
+    }
+  }
+  if (!x_broadcasted) {
+    new_dims.push_back(M);
+  }
+  if (!y_broadcasted) {
+    new_dims.push_back(N);
+  }
+  if (x_broadcasted && y_broadcasted) {
+    new_dims.push_back(1);
+  }
+
+  auto ddim_out = paddle::framework::make_ddim(new_dims);
+
+  return {x_meta.type, ddim_out, x_meta.layout};
+}
+
+}  // namespace pten
diff --git a/paddle/pten/infershape/binary.h b/paddle/pten/infershape/binary.h
new file mode 100644
index 0000000000000..f58e5503f22a1
--- /dev/null
+++ b/paddle/pten/infershape/binary.h
@@ -0,0 +1,44 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+// See Note [ Why still include the fluid headers? ]
+#include "paddle/pten/core/tensor_meta.h"
+
+namespace pten {
+
+// Common InferShape Functions for binary operators, The format like:
+//
+//   1. DenseTensorMeta [OpName]InferShape(const DenseTensorMeta& x_meta, ...)
+//   {}
+//   2. std::pair<DenseTensorMeta, DenseTensorMeta> [OpName]InferShape(const
+//   DenseTensorMeta&
+//   x_meta, ...) {}
+//   3. std::tuple<DenseTensorMeta, DenseTensorMeta, DenseTensorMeta>
+//   [OpName]InferShape(const
+//   DenseTensorMeta& x_meta, ...)
+//  NOTE: The name "InferShape" may be not appropriate. "InferMeta" may be good.
+//  Because functions in this file
+//  not only can infer shape, but alse need infer lod or other useful data.
+
+DenseTensorMeta DotInferShape(const DenseTensorMeta& x_meta,
+                              const DenseTensorMeta& y_meta);
+
+DenseTensorMeta MatmulInferShape(const DenseTensorMeta& x_meta,
+                                 const DenseTensorMeta& y_meta,
+                                 bool trans_x,
+                                 bool trans_y);
+
+}  // namespace pten
diff --git a/paddle/pten/infershape/nary.cc b/paddle/pten/infershape/nary.cc
new file mode 100644
index 0000000000000..b8745dd9b83af
--- /dev/null
+++ b/paddle/pten/infershape/nary.cc
@@ -0,0 +1,27 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+// See Note [ Why still include the fluid headers? ]
+#include "paddle/pten/infershape/nary.h"
+
+namespace pten {
+
+DenseTensorMeta FullInferShape(const std::vector<int64_t>& shape,
+                               DataType dtype,
+                               DataLayout layout) {
+  const auto& out_dims = paddle::framework::make_ddim(shape);
+  return {dtype, out_dims, layout};
+}
+
+}  // namespace pten
diff --git a/paddle/pten/infershape/nary.h b/paddle/pten/infershape/nary.h
new file mode 100644
index 0000000000000..8900e0ed71c9f
--- /dev/null
+++ b/paddle/pten/infershape/nary.h
@@ -0,0 +1,34 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+// See Note [ Why still include the fluid headers? ]
+#include "paddle/pten/core/tensor_meta.h"
+
+namespace pten {
+
+// Common InferShape Functions for 0-nary operators(no input tensor), The format
+// like:
+//
+//   1. DenseTensorMeta [OpName]InferShape( ...)
+//  NOTE: The name "InferShape" may be not appropriate. "InferMeta" may be good.
+//  Because functions in this file
+//  not only can infer shape, but alse need infer lod or other useful data.
+
+DenseTensorMeta FullInferShape(const std::vector<int64_t>& shape,
+                               DataType dtype,
+                               DataLayout layout);
+
+}  // namespace pten
diff --git a/paddle/pten/infershape/unary.cc b/paddle/pten/infershape/unary.cc
new file mode 100644
index 0000000000000..4e743261b5906
--- /dev/null
+++ b/paddle/pten/infershape/unary.cc
@@ -0,0 +1,77 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+// See Note [ Why still include the fluid headers? ]
+#include "paddle/pten/infershape/unary.h"
+
+namespace pten {
+
+DenseTensorMeta UnchangedInferShape(const DenseTensorMeta& x_meta) {
+  return x_meta;
+}
+
+DenseTensorMeta ReductionInferShape(const DenseTensorMeta& x_meta) {
+  const auto& out_dims = paddle::framework::make_ddim({1});
+  DenseTensorMeta return_meta(x_meta.type, out_dims, x_meta.layout);
+  return return_meta;
+}
+
+DenseTensorMeta FlattenInferShape(const DenseTensorMeta& x_meta,
+                                  int start_axis,
+                                  int stop_axis) {
+  auto& x_dims = x_meta.dims;
+  int in_dims_size = x_dims.size();
+  if (start_axis < 0) {
+    start_axis = start_axis + in_dims_size;
+  }
+  if (stop_axis < 0) {
+    stop_axis = stop_axis + in_dims_size;
+  }
+  PADDLE_ENFORCE_GE(stop_axis,
+                    start_axis,
+                    paddle::platform::errors::InvalidArgument(
+                        "The stop_axis should be greater"
+                        "than or equal to start_axis."));
+
+  int64_t outer = 1;
+  std::vector<int32_t> out_shape;
+  out_shape.reserve(in_dims_size - stop_axis + start_axis);
+
+  for (int i = 0; i < start_axis; ++i) {
+    out_shape.push_back(x_dims[i]);
+  }
+  for (int i = start_axis; i <= stop_axis; i++) {
+    if (x_dims[i] == -1 || outer == -1) {
+      outer = -1;
+    } else {
+      outer *= x_dims[i];
+    }
+  }
+  out_shape.push_back(outer);
+  for (int i = stop_axis + 1; i < in_dims_size; i++) {
+    out_shape.push_back(x_dims[i]);
+  }
+  const auto& out_dims = paddle::framework::make_ddim(out_shape);
+  DenseTensorMeta return_meta(x_meta.type, out_dims, x_meta.layout);
+
+  if (x_dims[0] == return_meta.dims[0]) {
+    // Only pass LoD when the first dimension of output and Input(X)
+    // are the same.
+    return_meta.lod = x_meta.lod;
+  }
+
+  return return_meta;
+}
+
+}  // namespace pten
diff --git a/paddle/pten/infershape/unary.h b/paddle/pten/infershape/unary.h
new file mode 100644
index 0000000000000..1db0b094eba3a
--- /dev/null
+++ b/paddle/pten/infershape/unary.h
@@ -0,0 +1,44 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+// See Note [ Why still include the fluid headers? ]
+#include "paddle/pten/core/tensor_meta.h"
+
+namespace pten {
+
+// Common InferShape Functions for unary operators, The format like:
+//
+//   1. DenseTensorMeta [OpName]InferShape(const DenseTensorMeta& x_meta, ...)
+//   {}
+//   2. std::pair<DenseTensorMeta, DenseTensorMeta> [OpName]InferShape(const
+//   DenseTensorMeta&
+//   x_meta, ...) {}
+//   3. std::tuple<DenseTensorMeta, DenseTensorMeta, DenseTensorMeta>
+//   [OpName]InferShape(const
+//   DenseTensorMeta& x_meta, ...)
+//  NOTE: The name "InferShape" may be not appropriate. "InferMeta" may be good.
+//  Because functions in this file
+//  not only can infer shape, but alse need infer lod or other useful data.
+
+DenseTensorMeta UnchangedInferShape(const DenseTensorMeta& x_meta);
+
+DenseTensorMeta ReductionInferShape(const DenseTensorMeta& x_meta);
+
+DenseTensorMeta FlattenInferShape(const DenseTensorMeta& x_meta,
+                                  int start_axis,
+                                  int stop_axis);
+
+}  // namespace pten
diff --git a/paddle/pten/kernels/CMakeLists.txt b/paddle/pten/kernels/CMakeLists.txt
new file mode 100644
index 0000000000000..486fd73c00f33
--- /dev/null
+++ b/paddle/pten/kernels/CMakeLists.txt
@@ -0,0 +1,20 @@
+# pten basic functions called by kernels
+add_subdirectory(functions)
+# pten kernels for diff device
+add_subdirectory(cpu)
+if(WITH_GPU OR WITH_ROCM)
+  # TODO(chenweihang): if hip can split from cuda impl, we should add hip dir
+  add_subdirectory(cuda)
+endif()
+# TODO(chenweihang): migrate MKLDNN Kernel in the second phase of the project
+if(WITH_MKLDNN)
+  add_subdirectory(mkldnn)
+endif()
+# TODO(chenweihang): migrate NPU Kernel in the second phase of the project
+if(WITH_ASCEND_CL)
+  add_subdirectory(npu)
+endif()
+# TODO(chenweihang): migrate XPU Kernel in the second phase of the project
+if(WITH_XPU)
+  add_subdirectory(xpu)
+endif()
diff --git a/paddle/pten/kernels/cpu/CMakeLists.txt b/paddle/pten/kernels/cpu/CMakeLists.txt
new file mode 100644
index 0000000000000..2c4a424e48492
--- /dev/null
+++ b/paddle/pten/kernels/cpu/CMakeLists.txt
@@ -0,0 +1,5 @@
+cc_library(math_cpu SRCS math.cc DEPS dense_tensor kernel_context kernel_factory eigen_function)
+cc_library(linalg_cpu SRCS linalg.cc DEPS dense_tensor kernel_context kernel_factory)
+cc_library(creation_cpu SRCS creation.cc DEPS dense_tensor kernel_context kernel_factory eigen_function)
+cc_library(utils_cpu SRCS utils.cc DEPS dense_tensor kernel_context kernel_factory memory convert_utils)
+cc_library(manipulation_cpu SRCS manipulation.cc DEPS dense_tensor kernel_context kernel_factory utils_cpu unary)
diff --git a/paddle/pten/kernels/cpu/creation.cc b/paddle/pten/kernels/cpu/creation.cc
new file mode 100644
index 0000000000000..2ab2537a84437
--- /dev/null
+++ b/paddle/pten/kernels/cpu/creation.cc
@@ -0,0 +1,90 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/pten/kernels/cpu/creation.h"
+
+#include "paddle/pten/core/kernel_registry.h"
+#include "paddle/pten/kernels/functions/eigen/fill.h"
+
+namespace pten {
+
+template <typename T>
+void FillAnyLike(const CPUContext& dev_ctx,
+                 const DenseTensor& x,
+                 const Scalar& val,
+                 DenseTensor* out) {
+  auto value = val.to<float>();
+  using CommonType = typename std::common_type<
+      float,
+      typename std::conditional<
+          std::is_same<T, paddle::platform::float16>::value,
+          float,
+          T>::type>::type;
+
+  auto common_type_value = static_cast<CommonType>(value);
+
+  PADDLE_ENFORCE_EQ(
+      (common_type_value >=
+       static_cast<CommonType>(std::numeric_limits<T>::lowest())) &&
+          (common_type_value <=
+           static_cast<CommonType>(std::numeric_limits<T>::max())),
+      true,
+      paddle::platform::errors::InvalidArgument(
+          "The filled value is out of range for target type, "
+          "current kernel type is %s, the range should between %f "
+          "and %f, but now value is %f.",
+          typeid(T).name(),
+          static_cast<CommonType>(std::numeric_limits<T>::lowest()),
+          static_cast<CommonType>(std::numeric_limits<T>::max()),
+          static_cast<float>(value)));
+  eigen::fill<CPUContext, T>(dev_ctx, out, value);
+}
+
+template <typename T>
+void FillConstant(const CPUContext& dev_ctx,
+                  const Scalar& val,
+                  DenseTensor* out) {
+  eigen::fill<CPUContext, T>(dev_ctx, out, val.to<T>());
+}
+
+}  // namespace pten
+
+PT_REGISTER_MODULE(CreationCPU);
+
+PT_REGISTER_KERNEL("fill_any_like",
+                   CPU,
+                   ANY,
+                   pten::FillAnyLike,
+                   float,
+                   double,
+                   int,
+                   int64_t,
+                   bool,
+                   paddle::platform::float16) {}
+
+PT_REGISTER_KERNEL("fill_constant.scalar",
+                   CPU,
+                   ANY,
+                   pten::FillConstant,
+                   float,
+                   double,
+                   uint8_t,
+                   int16_t,
+                   int,
+                   int64_t,
+                   bool,
+                   paddle::platform::float16,
+                   paddle::platform::bfloat16,
+                   paddle::platform::complex<float>,
+                   paddle::platform::complex<double>) {}
diff --git a/paddle/pten/kernels/cpu/creation.h b/paddle/pten/kernels/cpu/creation.h
new file mode 100644
index 0000000000000..6d7732033aed9
--- /dev/null
+++ b/paddle/pten/kernels/cpu/creation.h
@@ -0,0 +1,37 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/pten/common/scalar.h"
+#include "paddle/pten/core/dense_tensor.h"
+
+#include "paddle/fluid/platform/device_context.h"
+
+namespace pten {
+
+using CPUContext = paddle::platform::CPUDeviceContext;
+
+template <typename T>
+void FillAnyLike(const CPUContext& dev_ctx,
+                 const DenseTensor& x,
+                 const Scalar& val,
+                 DenseTensor* out);
+
+template <typename T>
+void FillConstant(const CPUContext& dev_ctx,
+                  const Scalar& val,
+                  DenseTensor* out);
+
+}  // namespace pten
diff --git a/paddle/pten/kernels/cpu/linalg.cc b/paddle/pten/kernels/cpu/linalg.cc
new file mode 100644
index 0000000000000..ced13dc41d1ae
--- /dev/null
+++ b/paddle/pten/kernels/cpu/linalg.cc
@@ -0,0 +1,91 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/pten/kernels/cpu/linalg.h"
+
+#include "paddle/pten/core/kernel_registry.h"
+
+// See Note [ Why still include the fluid headers? ]
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/operators/math/blas.h"
+#include "paddle/fluid/platform/complex.h"
+
+#include "paddle/pten/kernels/functions/math/matmul_func.h"
+
+namespace pten {
+
+template <typename T>
+void Dot(const CPUContext& dev_ctx,
+         const DenseTensor& x,
+         const DenseTensor& y,
+         DenseTensor* out) {
+  auto const *x_ptr = x.data<T>(), *x_ptr_ = &x_ptr[0];
+  auto const *y_ptr = y.data<T>(), *y_ptr_ = &y_ptr[0];
+  auto* z = out->mutable_data<T>();
+
+  // Loop over the total N elements of both operands while sum-reducing every
+  // B pairs along the way where B is the dimension of the least ordered axis
+  auto&& d = x.dims();
+  auto const N = x.numel();
+  auto const B = d[d.size() - 1];
+
+  for (int j = 0; j < N / B; j++) {
+    T ss = 0;
+    for (int i = 0; i < B; i++) ss += (*x_ptr_++) * (*y_ptr_++);
+    z[j] = ss;
+  }
+}
+
+template <typename T>
+void Matmul(const CPUContext& dev_ctx,
+            const DenseTensor& x,
+            const DenseTensor& y,
+            bool transpose_x,
+            bool transpose_y,
+            DenseTensor* out) {
+  PADDLE_ENFORCE_NE(paddle::framework::product(x.dims()),
+                    0,
+                    paddle::platform::errors::InvalidArgument(
+                        "The Input(X) dims size must not be equal 0,"
+                        " but reviced dims size is 0. "));
+  PADDLE_ENFORCE_NE(paddle::framework::product(y.dims()),
+                    0,
+                    paddle::platform::errors::InvalidArgument(
+                        "The Input(Y) dims size must not be equal 0,"
+                        " but reviced dims size is 0. "));
+  math::MatMulFunction<CPUContext, T>(
+      dev_ctx, x, y, out, transpose_x, transpose_y);
+}
+
+}  // namespace pten
+
+PT_REGISTER_MODULE(LinalgCPU);
+
+using complex64 = ::paddle::platform::complex<float>;
+using complex128 = ::paddle::platform::complex<double>;
+
+PT_REGISTER_KERNEL("dot",
+                   CPU,
+                   ANY,
+                   pten::Dot,
+                   float,
+                   double,
+                   int,
+                   int64_t,
+                   complex64,
+                   complex128) {}
+
+PT_REGISTER_KERNEL(
+    "matmul_v2", CPU, ANY, pten::Matmul, float, double, complex64, complex128) {
+}
diff --git a/paddle/pten/kernels/cpu/linalg.h b/paddle/pten/kernels/cpu/linalg.h
new file mode 100644
index 0000000000000..a954033866f17
--- /dev/null
+++ b/paddle/pten/kernels/cpu/linalg.h
@@ -0,0 +1,40 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/pten/core/dense_tensor.h"
+
+// See Note [ Why still include the fluid headers? ]
+#include "paddle/fluid/platform/device_context.h"
+
+namespace pten {
+
+using CPUContext = paddle::platform::CPUDeviceContext;
+
+template <typename T>
+void Dot(const CPUContext& dev_ctx,
+         const DenseTensor& x,
+         const DenseTensor& y,
+         DenseTensor* out);
+
+template <typename T>
+void Matmul(const CPUContext& dev_ctx,
+            const DenseTensor& x,
+            const DenseTensor& y,
+            bool transpose_x,
+            bool transpose_y,
+            DenseTensor* out);
+
+}  // namespace pten
diff --git a/paddle/pten/kernels/cpu/manipulation.cc b/paddle/pten/kernels/cpu/manipulation.cc
new file mode 100644
index 0000000000000..87c76149f127f
--- /dev/null
+++ b/paddle/pten/kernels/cpu/manipulation.cc
@@ -0,0 +1,80 @@
+//   Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/pten/kernels/cpu/manipulation.h"
+#include "paddle/pten/infershape/unary.h"
+#include "paddle/pten/kernels/cpu/utils.h"
+
+namespace pten {
+
+template <typename T>
+void Flatten(const CPUContext& dev_ctx,
+             const DenseTensor& x,
+             int start_axis,
+             int stop_axis,
+             DenseTensor* out) {
+  auto out_dims = out->dims();
+  pten::Copy(dev_ctx, x, out);
+  out->Resize(out_dims);
+}
+
+// TODO(yuanrisheng): this kernel is for training and xshape is a Intermediate
+// Output Tensor，
+// is there a more flexible way to deal with this case?
+template <typename T>
+void FlattenWithXShape(const CPUContext& dev_ctx,
+                       const DenseTensor& x,
+                       int start_axis,
+                       int stop_axis,
+                       DenseTensor* out,
+                       DenseTensor* xshape) {
+  Flatten<T>(dev_ctx, x, start_axis, stop_axis, out);
+  const auto& in_dims = x.meta().dims;
+  std::vector<int64_t> xshape_dims(in_dims.size() + 1);
+  xshape_dims[0] = 0;
+  for (int i = 0; i < in_dims.size(); ++i) {
+    xshape_dims[i + 1] = in_dims[i];
+  }
+  xshape->Resize(paddle::framework::make_ddim(xshape_dims));
+  xshape->set_lod(x.lod());
+}
+
+}  // namespace pten
+
+// TODO(chenweihang): replace by better impl
+PT_REGISTER_MODULE(ManipulationCPU);
+
+// TODO(yuanrisheng): "flatten_contiguous_range" is compatible with old kernel
+// architecture, kernel_name should be "flatten".
+PT_REGISTER_KERNEL("flatten_contiguous_range",
+                   CPU,
+                   ANY,
+                   pten::Flatten,
+                   float,
+                   double,
+                   uint8_t,
+                   int8_t,
+                   int,
+                   int64_t) {}
+
+PT_REGISTER_KERNEL("flatten_contiguous_range.mid",
+                   CPU,
+                   ANY,
+                   pten::FlattenWithXShape,
+                   float,
+                   double,
+                   uint8_t,
+                   int8_t,
+                   int,
+                   int64_t) {}
diff --git a/paddle/pten/kernels/cpu/manipulation.h b/paddle/pten/kernels/cpu/manipulation.h
new file mode 100644
index 0000000000000..22dfb0d8fccba
--- /dev/null
+++ b/paddle/pten/kernels/cpu/manipulation.h
@@ -0,0 +1,34 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/pten/core/dense_tensor.h"
+#include "paddle/pten/core/kernel_registry.h"
+
+// See Note [ Why still include the fluid headers? ]
+#include "paddle/fluid/platform/device_context.h"
+
+namespace pten {
+
+using CPUContext = paddle::platform::CPUDeviceContext;
+
+template <typename T>
+void Flatten(const CPUContext& dev_ctx,
+             const DenseTensor& x,
+             int start_axis,
+             int stop_axis,
+             DenseTensor* out);
+
+}  // namespace pten
diff --git a/paddle/pten/kernels/cpu/math.cc b/paddle/pten/kernels/cpu/math.cc
new file mode 100644
index 0000000000000..0682479993f35
--- /dev/null
+++ b/paddle/pten/kernels/cpu/math.cc
@@ -0,0 +1,99 @@
+//   Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/pten/kernels/cpu/math.h"
+
+#include "paddle/pten/kernels/functions/eigen/mean.h"
+#include "paddle/pten/kernels/functions/eigen/scale.h"
+#include "paddle/pten/kernels/functions/eigen/sign.h"
+
+// See Note [ Why still include the fluid headers? ]
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/platform/bfloat16.h"
+
+namespace pten {
+
+template <typename T>
+void Sign(const CPUContext& dev_ctx, const DenseTensor& x, DenseTensor* out) {
+  eigen::Sign<CPUContext, T>(dev_ctx, x, out);
+}
+
+template <typename T>
+void Mean(const CPUContext& dev_ctx, const DenseTensor& x, DenseTensor* out) {
+  eigen::Mean<CPUContext, T>(dev_ctx, x, out);
+}
+
+template <typename T>
+void Scale(const CPUContext& dev_ctx,
+           const DenseTensor& x,
+           float scale,
+           float bias,
+           bool bias_after_scale,
+           DenseTensor* out) {
+  eigen::Scale<CPUContext, T>(dev_ctx, x, scale, bias, bias_after_scale, out);
+}
+
+// TODO(chenweihang): now the ScaleTensor's dtype are same as x, so we cannot
+// register its dtype def
+template <typename T>
+void ScaleHost(const CPUContext& dev_ctx,
+               const DenseTensor& x,
+               const DenseTensor& scale,
+               float bias,
+               bool bias_after_scale,
+               DenseTensor* out) {
+  eigen::Scale<CPUContext, T>(dev_ctx,
+                              x,
+                              static_cast<float>(*scale.data<T>()),
+                              bias,
+                              bias_after_scale,
+                              out);
+}
+
+}  // namespace pten
+
+// TODO(chenweihang): replace by better impl
+PT_REGISTER_MODULE(MathCPU);
+
+// NOTE(chenweihang): using bfloat16 will cause redefine with xpu bfloat16
+// using bfloat16 = ::paddle::platform::bfloat16;
+
+PT_REGISTER_KERNEL("sign", CPU, ANY, pten::Sign, float, double) {}
+PT_REGISTER_KERNEL("mean", CPU, ANY, pten::Mean, float, double) {}
+PT_REGISTER_KERNEL("scale",
+                   CPU,
+                   ANY,
+                   pten::Scale,
+                   float,
+                   double,
+                   paddle::platform::bfloat16,
+                   uint8_t,
+                   int8_t,
+                   int16_t,
+                   int,
+                   int64_t) {}
+PT_REGISTER_KERNEL("scale.host",
+                   CPU,
+                   ANY,
+                   pten::ScaleHost,
+                   float,
+                   double,
+                   paddle::platform::bfloat16,
+                   uint8_t,
+                   int8_t,
+                   int16_t,
+                   int,
+                   int64_t) {
+  kernel->InputAt(1).SetBackend(pten::Backend::CPU);
+}
diff --git a/paddle/pten/kernels/cpu/math.h b/paddle/pten/kernels/cpu/math.h
new file mode 100644
index 0000000000000..3013ad9d04d0b
--- /dev/null
+++ b/paddle/pten/kernels/cpu/math.h
@@ -0,0 +1,49 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/pten/core/dense_tensor.h"
+#include "paddle/pten/core/kernel_registry.h"
+
+// See Note [ Why still include the fluid headers? ]
+#include "paddle/fluid/platform/device_context.h"
+
+namespace pten {
+
+using CPUContext = paddle::platform::CPUDeviceContext;
+
+template <typename T>
+void Sign(const CPUContext& dev_ctx, const DenseTensor& x, DenseTensor* out);
+
+template <typename T>
+void Mean(const CPUContext& dev_ctx, const DenseTensor& x, DenseTensor* out);
+
+template <typename T>
+void Scale(const CPUContext& dev_ctx,
+           const DenseTensor& x,
+           float scale,
+           float bias,
+           bool bias_after_scale,
+           DenseTensor* out);
+
+template <typename T>
+void ScaleHost(const CPUContext& dev_ctx,
+               const DenseTensor& x,
+               const DenseTensor& scale,
+               float bias,
+               bool bias_after_scale,
+               DenseTensor* out);
+
+}  // namespace pten
diff --git a/paddle/pten/kernels/cpu/utils.cc b/paddle/pten/kernels/cpu/utils.cc
new file mode 100644
index 0000000000000..1f9d675deafa2
--- /dev/null
+++ b/paddle/pten/kernels/cpu/utils.cc
@@ -0,0 +1,57 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/pten/kernels/cpu/utils.h"
+#include "paddle/fluid/memory/memcpy.h"
+#include "paddle/pten/common/data_type.h"
+#include "paddle/pten/core/convert_utils.h"
+
+namespace pten {
+
+void Copy(const CPUContext& dev_ctx, const DenseTensor& src, DenseTensor* dst) {
+  auto* src_ptr = src.data();
+  auto* dst_ptr = dst->mutable_data();
+  const auto& src_place = src.place();
+  const auto& dst_place = dst->place();
+
+  if (src_ptr == dst_ptr && src_place == dst_place) {
+    VLOG(3) << "Skip copy the same data async from " << src_place << " to "
+            << dst_place;
+    return;
+  }
+  VLOG(4) << "src:" << src_ptr << ", dst:" << dst_ptr;
+
+  VLOG(3) << "TensorCopy " << src.dims() << " from " << src.place() << " to "
+          << dst_place;
+  dst->Resize(src.dims());
+  CHECK(dst->layout() == src.layout());
+  auto size = src.numel() * paddle::framework::SizeOfType(
+                                TransToProtoVarType(src.data_type()));
+
+  if (paddle::platform::is_cpu_place(src_place) &&
+      paddle::platform::is_cpu_place(dst_place)) {
+    paddle::memory::Copy(BOOST_GET_CONST(paddle::platform::CPUPlace, dst_place),
+                         dst_ptr,
+                         BOOST_GET_CONST(paddle::platform::CPUPlace, src_place),
+                         src_ptr,
+                         size);
+  }
+}
+
+}  // namespace pten
+
+// TODO(chenweihang): replace by better impl
+PT_REGISTER_MODULE(UtilsCPU);
+
+PT_REGISTER_KERNEL_WITH_NO_TYPE("copy", CPU, ANY, pten::Copy) {}
diff --git a/paddle/pten/kernels/cpu/utils.h b/paddle/pten/kernels/cpu/utils.h
new file mode 100644
index 0000000000000..38f601b4cf91f
--- /dev/null
+++ b/paddle/pten/kernels/cpu/utils.h
@@ -0,0 +1,28 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/pten/core/dense_tensor.h"
+#include "paddle/pten/core/kernel_registry.h"
+
+// See Note [ Why still include the fluid headers? ]
+#include "paddle/fluid/platform/device_context.h"
+namespace pten {
+
+using CPUContext = paddle::platform::CPUDeviceContext;
+
+void Copy(const CPUContext& dev_ctx, const DenseTensor& src, DenseTensor* dst);
+
+}  // namespace pten
diff --git a/paddle/pten/kernels/cuda/CMakeLists.txt b/paddle/pten/kernels/cuda/CMakeLists.txt
new file mode 100644
index 0000000000000..9e86d9521c99a
--- /dev/null
+++ b/paddle/pten/kernels/cuda/CMakeLists.txt
@@ -0,0 +1,13 @@
+if(WITH_GPU)
+  nv_library(math_cuda SRCS math.cu DEPS eigen_function dense_tensor convert_utils kernel_context kernel_factory)
+  nv_library(linalg_cuda SRCS linalg.cu DEPS eigen_function dense_tensor kernel_context kernel_factory)
+  nv_library(creation_cuda SRCS creation.cu DEPS eigen_function dense_tensor kernel_context kernel_factory)
+  nv_library(utils_cuda SRCS utils.cu DEPS dense_tensor kernel_context kernel_factory memory convert_utils)
+  nv_library(manipulation_cuda SRCS manipulation.cu DEPS dense_tensor kernel_context kernel_factory utils_cuda unary)
+elseif(WITH_ROCM)
+  hip_library(math_cuda SRCS math.cu DEPS eigen_function dense_tensor convert_utils kernel_context kernel_factory)
+  hip_library(linalg_cuda SRCS linalg.cu DEPS eigen_function dense_tensor kernel_context kernel_factory)
+  hip_library(creation_cuda SRCS creation.cu DEPS eigen_function dense_tensor kernel_context kernel_factory)
+  hip_library(utils_cuda SRCS utils.cu DEPS dense_tensor kernel_context kernel_factory memory convert_utils)
+  hip_library(manipulation_cuda SRCS manipulation.cu DEPS dense_tensor kernel_context kernel_factory utils_cuda unary)
+endif()
diff --git a/paddle/pten/kernels/cuda/creation.cu b/paddle/pten/kernels/cuda/creation.cu
new file mode 100644
index 0000000000000..b96b5ebea9b70
--- /dev/null
+++ b/paddle/pten/kernels/cuda/creation.cu
@@ -0,0 +1,90 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/pten/kernels/cuda/creation.h"
+
+#include "paddle/pten/core/kernel_registry.h"
+#include "paddle/pten/kernels/functions/eigen/fill.h"
+
+namespace pten {
+
+template <typename T>
+void FillAnyLike(const CUDAContext& dev_ctx,
+                 const DenseTensor& x,
+                 const Scalar& val,
+                 DenseTensor* out) {
+  auto value = val.to<float>();
+  using CommonType = typename std::common_type<
+      float,
+      typename std::conditional<
+          std::is_same<T, paddle::platform::float16>::value,
+          float,
+          T>::type>::type;
+
+  auto common_type_value = static_cast<CommonType>(value);
+
+  PADDLE_ENFORCE_EQ(
+      (common_type_value >=
+       static_cast<CommonType>(std::numeric_limits<T>::lowest())) &&
+          (common_type_value <=
+           static_cast<CommonType>(std::numeric_limits<T>::max())),
+      true,
+      paddle::platform::errors::InvalidArgument(
+          "The filled value is out of range for target type, "
+          "current kernel type is %s, the range should between %f "
+          "and %f, but now value is %f.",
+          typeid(T).name(),
+          static_cast<CommonType>(std::numeric_limits<T>::lowest()),
+          static_cast<CommonType>(std::numeric_limits<T>::max()),
+          static_cast<float>(value)));
+
+  eigen::fill<CUDAContext, T>(dev_ctx, out, val.to<float>());
+}
+
+template <typename T>
+void FillConstant(const CUDAContext& dev_ctx,
+                  const Scalar& val,
+                  DenseTensor* out) {
+  eigen::fill<CUDAContext, T>(dev_ctx, out, val.to<T>());
+}
+
+}  // namespace pten
+
+PT_REGISTER_MODULE(CreationCUDA);
+
+PT_REGISTER_KERNEL("fill_any_like",
+                   CUDA,
+                   ANY,
+                   pten::FillAnyLike,
+                   float,
+                   double,
+                   int,
+                   int64_t,
+                   bool,
+                   paddle::platform::float16) {}
+
+PT_REGISTER_KERNEL("fill_constant.scalar",
+                   CUDA,
+                   ANY,
+                   pten::FillConstant,
+                   float,
+                   double,
+                   uint8_t,
+                   int16_t,
+                   int,
+                   int64_t,
+                   bool,
+                   paddle::platform::float16,
+                   paddle::platform::complex<float>,
+                   paddle::platform::complex<double>) {}
diff --git a/paddle/pten/kernels/cuda/creation.h b/paddle/pten/kernels/cuda/creation.h
new file mode 100644
index 0000000000000..025cd6ba51b5d
--- /dev/null
+++ b/paddle/pten/kernels/cuda/creation.h
@@ -0,0 +1,42 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+// CUDA and HIP use same api
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+
+#include "paddle/pten/common/scalar.h"
+#include "paddle/pten/core/dense_tensor.h"
+
+#include "paddle/fluid/platform/device_context.h"
+
+namespace pten {
+
+using CUDAContext = paddle::platform::CUDADeviceContext;
+
+template <typename T>
+void FillAnyLike(const CUDAContext& dev_ctx,
+                 const DenseTensor& x,
+                 const Scalar& val,
+                 DenseTensor* out);
+
+template <typename T>
+void FillConstant(const CUDAContext& dev_ctx,
+                  const Scalar& val,
+                  DenseTensor* out);
+
+}  // namespace pten
+
+#endif
diff --git a/paddle/pten/kernels/cuda/linalg.cu b/paddle/pten/kernels/cuda/linalg.cu
new file mode 100644
index 0000000000000..6811afa8a49ff
--- /dev/null
+++ b/paddle/pten/kernels/cuda/linalg.cu
@@ -0,0 +1,82 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/pten/kernels/cuda/linalg.h"
+
+#include "paddle/pten/core/kernel_registry.h"
+#include "paddle/pten/kernels/functions/eigen/dot.h"
+#include "paddle/pten/kernels/functions/math/matmul_func.h"
+
+// See Note [ Why still include the fluid headers? ]
+#include "paddle/fluid/platform/complex.h"
+
+namespace pten {
+
+template <typename T>
+void Dot(const CUDAContext& dev_ctx,
+         const DenseTensor& x,
+         const DenseTensor& y,
+         DenseTensor* out) {
+  eigen::Dot<CUDAContext, T>(dev_ctx, x, y, out);
+}
+
+template <typename T>
+void Matmul(const CUDAContext& dev_ctx,
+            const DenseTensor& x,
+            const DenseTensor& y,
+            bool transpose_x,
+            bool transpose_y,
+            DenseTensor* out) {
+  PADDLE_ENFORCE_NE(paddle::framework::product(x.dims()),
+                    0,
+                    paddle::platform::errors::InvalidArgument(
+                        "The Input(X) dims size must not be equal 0,"
+                        " but reviced dims size is 0. "));
+  PADDLE_ENFORCE_NE(paddle::framework::product(y.dims()),
+                    0,
+                    paddle::platform::errors::InvalidArgument(
+                        "The Input(Y) dims size must not be equal 0,"
+                        " but reviced dims size is 0. "));
+  math::MatMulFunction<CUDAContext, T>(
+      dev_ctx, x, y, out, transpose_x, transpose_y);
+}
+
+}  // namespace pten
+
+PT_REGISTER_MODULE(LinalgCUDA);
+
+using float16 = paddle::platform::float16;
+using complex64 = ::paddle::platform::complex<float>;
+using complex128 = ::paddle::platform::complex<double>;
+
+PT_REGISTER_KERNEL("dot",
+                   CUDA,
+                   ANY,
+                   pten::Dot,
+                   float,
+                   double,
+                   int,
+                   int64_t,
+                   complex64,
+                   complex128) {}
+
+PT_REGISTER_KERNEL("matmul_v2",
+                   CUDA,
+                   ANY,
+                   pten::Matmul,
+                   float,
+                   double,
+                   float16,
+                   complex64,
+                   complex128) {}
diff --git a/paddle/pten/kernels/cuda/linalg.h b/paddle/pten/kernels/cuda/linalg.h
new file mode 100644
index 0000000000000..a6489efa72eee
--- /dev/null
+++ b/paddle/pten/kernels/cuda/linalg.h
@@ -0,0 +1,45 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+// CUDA and HIP use same api
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+
+#include "paddle/pten/core/dense_tensor.h"
+
+// See Note [ Why still include the fluid headers? ]
+#include "paddle/fluid/platform/device_context.h"
+
+namespace pten {
+
+using CUDAContext = paddle::platform::CUDADeviceContext;
+
+template <typename T>
+void Dot(const CUDAContext& dev_ctx,
+         const DenseTensor& x,
+         const DenseTensor& y,
+         DenseTensor* out);
+
+template <typename T>
+void Matmul(const CUDAContext& dev_ctx,
+            const DenseTensor& x,
+            const DenseTensor& y,
+            bool transpose_x,
+            bool transpose_y,
+            DenseTensor* out);
+
+}  // namespace pten
+
+#endif
diff --git a/paddle/pten/kernels/cuda/manipulation.cu b/paddle/pten/kernels/cuda/manipulation.cu
new file mode 100644
index 0000000000000..38111f2b8c02f
--- /dev/null
+++ b/paddle/pten/kernels/cuda/manipulation.cu
@@ -0,0 +1,82 @@
+//   Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/pten/infershape/unary.h"
+#include "paddle/pten/kernels/cuda/manipulation.h"
+#include "paddle/pten/kernels/cuda/utils.h"
+
+namespace pten {
+
+template <typename T>
+void Flatten(const CUDAContext& dev_ctx,
+             const DenseTensor& x,
+             int start_axis,
+             int stop_axis,
+             DenseTensor* out) {
+  auto out_dims = out->dims();
+  pten::Copy(dev_ctx, x, out);
+  out->Resize(out_dims);
+}
+
+// TODO(yuanrisheng): this kernel is for training and xshape is a Intermediate
+// Output Tensor，
+// is there a more flexible way to deal with this case?
+template <typename T>
+void FlattenWithXShape(const CUDAContext& dev_ctx,
+                       const DenseTensor& x,
+                       int start_axis,
+                       int stop_axis,
+                       DenseTensor* out,
+                       DenseTensor* xshape) {
+  Flatten<T>(dev_ctx, x, start_axis, stop_axis, out);
+  const auto& in_dims = x.meta().dims;
+  std::vector<int64_t> xshape_dims(in_dims.size() + 1);
+  xshape_dims[0] = 0;
+  for (int i = 0; i < in_dims.size(); ++i) {
+    xshape_dims[i + 1] = in_dims[i];
+  }
+  xshape->Resize(paddle::framework::make_ddim(xshape_dims));
+  xshape->set_lod(x.lod());
+}
+
+}  // namespace pten
+
+// TODO(chenweihang): replace by better impl
+PT_REGISTER_MODULE(ManipulationCUDA);
+
+using float16 = paddle::platform::float16;
+// TODO(yuanrisheng): "flatten_contiguous_range" is compatible with old kernel
+// architecture, kernel_name should be "flatten".
+PT_REGISTER_KERNEL("flatten_contiguous_range",
+                   CUDA,
+                   ANY,
+                   pten::Flatten,
+                   float,
+                   float16,
+                   double,
+                   uint8_t,
+                   int8_t,
+                   int,
+                   int64_t) {}
+
+PT_REGISTER_KERNEL("flatten_contiguous_range.mid",
+                   CUDA,
+                   ANY,
+                   pten::FlattenWithXShape,
+                   float,
+                   double,
+                   uint8_t,
+                   int8_t,
+                   int,
+                   int64_t) {}
diff --git a/paddle/pten/kernels/cuda/manipulation.h b/paddle/pten/kernels/cuda/manipulation.h
new file mode 100644
index 0000000000000..ac1cb0324f4ec
--- /dev/null
+++ b/paddle/pten/kernels/cuda/manipulation.h
@@ -0,0 +1,38 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+// CUDA and HIP use same api
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+
+#include "paddle/pten/core/dense_tensor.h"
+
+// See Note [ Why still include the fluid headers? ]
+#include "paddle/fluid/platform/device_context.h"
+
+namespace pten {
+
+using CUDAContext = paddle::platform::CUDADeviceContext;
+
+template <typename T>
+void Flatten(const CUDAContext& dev_ctx,
+             const DenseTensor& x,
+             int start_axis,
+             int stop_axis,
+             DenseTensor* out);
+
+}  // namespace pten
+
+#endif
diff --git a/paddle/pten/kernels/cuda/math.cu b/paddle/pten/kernels/cuda/math.cu
new file mode 100644
index 0000000000000..cc1a7bef4f18e
--- /dev/null
+++ b/paddle/pten/kernels/cuda/math.cu
@@ -0,0 +1,157 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/pten/kernels/cuda/math.h"
+
+#include "paddle/pten/kernels/functions/eigen/mean.h"
+#include "paddle/pten/kernels/functions/eigen/scale.h"
+#include "paddle/pten/kernels/functions/eigen/sign.h"
+
+#ifdef __NVCC__
+#include "cub/cub.cuh"
+#endif
+#ifdef __HIPCC__
+#include <hipcub/hipcub.hpp>
+namespace cub = hipcub;
+#endif
+
+#include "paddle/fluid/platform/enforce.h"
+#include "paddle/fluid/platform/float16.h"
+#include "paddle/pten/api/lib/utils/tensor_utils.h"
+#include "paddle/pten/core/convert_utils.h"
+#include "paddle/pten/core/kernel_registry.h"
+
+namespace pten {
+
+/**
+ * Util Functors
+ */
+
+template <typename T>
+struct DivideFunctor {
+  HOSTDEVICE explicit inline DivideFunctor(int n)
+      : n_inv(static_cast<T>(1.0 / n)) {}
+
+  HOSTDEVICE inline T operator()(const T& x) const { return x * n_inv; }
+
+ private:
+  T n_inv;
+};
+
+/**
+ * Kernels
+ */
+
+template <typename T>
+void Sign(const CUDAContext& dev_ctx, const DenseTensor& x, DenseTensor* out) {
+  eigen::Sign<CUDAContext, T>(dev_ctx, x, out);
+}
+
+template <typename T>
+void Mean(const CUDAContext& dev_ctx, const DenseTensor& x, DenseTensor* out) {
+  auto size_prob = x.numel();
+  const T* x_data = x.data<T>();
+  T* out_data = out->mutable_data<T>();
+  auto stream = dev_ctx.stream();
+
+  DivideFunctor<T> transformer(size_prob);
+  cub::TransformInputIterator<T, DivideFunctor<T>, const T*> trans_x(
+      x_data, transformer);
+  size_t temp_storage_bytes = 0;
+
+  auto err = cub::DeviceReduce::Sum(
+      nullptr, temp_storage_bytes, trans_x, out_data, size_prob, stream);
+  PADDLE_ENFORCE_CUDA_SUCCESS(err);
+
+  const auto alloc = std::make_shared<paddle::experimental::DefaultAllocator>(
+      dev_ctx.GetPlace());
+  pten::DenseTensor tmp(
+      alloc,
+      DenseTensorMeta(x.data_type(),
+                      paddle::framework::make_ddim(
+                          {static_cast<int64_t>(temp_storage_bytes)}),
+                      x.layout()));
+  void* temp_storage = tmp.mutable_data<T>();
+  err = cub::DeviceReduce::Sum(static_cast<uint8_t*>(temp_storage),
+                               temp_storage_bytes,
+                               trans_x,
+                               out_data,
+                               size_prob,
+                               stream);
+  PADDLE_ENFORCE_CUDA_SUCCESS(err);
+}
+
+template <typename T>
+void Scale(const CUDAContext& dev_ctx,
+           const DenseTensor& x,
+           float scale,
+           float bias,
+           bool bias_after_scale,
+           DenseTensor* out) {
+  eigen::Scale<CUDAContext, T>(dev_ctx, x, scale, bias, bias_after_scale, out);
+}
+
+template <typename T>
+void ScaleHost(const CUDAContext& dev_ctx,
+               const DenseTensor& x,
+               const DenseTensor& scale,
+               float bias,
+               bool bias_after_scale,
+               DenseTensor* out) {
+  PADDLE_ENFORCE_EQ(paddle::platform::is_gpu_place(scale.place()),
+                    false,
+                    paddle::platform::errors::InvalidArgument(
+                        "Scale argument isn't a host tensor."));
+  eigen::Scale<CUDAContext, T>(dev_ctx,
+                               x,
+                               static_cast<float>(*scale.data<T>()),
+                               bias,
+                               bias_after_scale,
+                               out);
+}
+
+}  // namespace pten
+
+// TODO(chenweihang): replace by better impl
+PT_REGISTER_MODULE(MathCUDA);
+
+using float16 = paddle::platform::float16;
+PT_REGISTER_KERNEL("sign", CUDA, ANY, pten::Sign, float, double, float16) {}
+PT_REGISTER_KERNEL("mean", CUDA, ANY, pten::Mean, float, double, float16) {}
+PT_REGISTER_KERNEL("scale",
+                   CUDA,
+                   ANY,
+                   pten::Scale,
+                   float,
+                   double,
+                   float16,
+                   uint8_t,
+                   int8_t,
+                   int16_t,
+                   int,
+                   int64_t) {}
+PT_REGISTER_KERNEL("scale.host",
+                   CUDA,
+                   ANY,
+                   pten::ScaleHost,
+                   float,
+                   double,
+                   float16,
+                   uint8_t,
+                   int8_t,
+                   int16_t,
+                   int,
+                   int64_t) {
+  kernel->InputAt(1).SetBackend(pten::Backend::CPU);
+}
diff --git a/paddle/pten/kernels/cuda/math.h b/paddle/pten/kernels/cuda/math.h
new file mode 100644
index 0000000000000..65f4f41265836
--- /dev/null
+++ b/paddle/pten/kernels/cuda/math.h
@@ -0,0 +1,53 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+// CUDA and HIP use same api
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+
+#include "paddle/pten/core/dense_tensor.h"
+
+// See Note [ Why still include the fluid headers? ]
+#include "paddle/fluid/platform/device_context.h"
+
+namespace pten {
+
+using CUDAContext = paddle::platform::CUDADeviceContext;
+
+template <typename T>
+void Sign(const CUDAContext& dev_ctx, const DenseTensor& x, DenseTensor* out);
+
+template <typename T>
+void Mean(const CUDAContext& dev_ctx, const DenseTensor& x, DenseTensor* out);
+
+template <typename T>
+void Scale(const CUDAContext& dev_ctx,
+           const DenseTensor& x,
+           float scale,
+           float bias,
+           bool bias_after_scale,
+           DenseTensor* out);
+
+template <typename T>
+void ScaleHost(const CUDAContext& dev_ctx,
+               const DenseTensor& x,
+               const DenseTensor& scale,
+               float bias,
+               bool bias_after_scale,
+               DenseTensor* out);
+
+}  // namespace pten
+
+#endif
diff --git a/paddle/pten/kernels/cuda/utils.cu b/paddle/pten/kernels/cuda/utils.cu
new file mode 100644
index 0000000000000..e81e00a5873f7
--- /dev/null
+++ b/paddle/pten/kernels/cuda/utils.cu
@@ -0,0 +1,222 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/memory/memcpy.h"
+#include "paddle/pten/common/data_type.h"
+#include "paddle/pten/core/convert_utils.h"
+#include "paddle/pten/core/kernel_registry.h"
+#include "paddle/pten/kernels/cuda/utils.h"
+
+namespace pten {
+
+void Copy(const CUDAContext& dev_ctx,
+          const DenseTensor& src,
+          DenseTensor* dst) {
+  auto* src_ptr = src.data();
+  auto* dst_ptr = dst->mutable_data();
+  const auto& src_place = src.place();
+  const auto& dst_place = dst->place();
+
+  if (src_ptr == dst_ptr && src_place == dst_place) {
+    VLOG(3) << "Skip copy the same data async from " << src_place << " to "
+            << dst_place;
+    return;
+  }
+  VLOG(4) << "src:" << src_ptr << ", dst:" << dst_ptr;
+
+  VLOG(3) << "TensorCopy " << src.dims() << " from " << src.place() << " to "
+          << dst_place;
+  dst->Resize(src.dims());
+  CHECK(dst->layout() == src.layout());
+  auto size = src.numel() * paddle::framework::SizeOfType(
+                                TransToProtoVarType(src.data_type()));
+
+  if (paddle::platform::is_cuda_pinned_place(src_place) &&  // NOLINT
+      paddle::platform::is_cuda_pinned_place(dst_place)) {
+    paddle::memory::Copy(
+        BOOST_GET_CONST(paddle::platform::CUDAPinnedPlace, dst_place),
+        dst_ptr,
+        BOOST_GET_CONST(paddle::platform::CUDAPinnedPlace, src_place),
+        src_ptr,
+        size);
+  } else if (paddle::platform::is_cuda_pinned_place(src_place) &&  // NOLINT
+             paddle::platform::is_cpu_place(dst_place)) {
+    paddle::memory::Copy(
+        BOOST_GET_CONST(paddle::platform::CPUPlace, dst_place),
+        dst_ptr,
+        BOOST_GET_CONST(paddle::platform::CUDAPinnedPlace, src_place),
+        src_ptr,
+        size);
+  } else if (paddle::platform::is_cpu_place(src_place) &&  // NOLINT
+             paddle::platform::is_cuda_pinned_place(dst_place)) {
+    paddle::memory::Copy(
+        BOOST_GET_CONST(paddle::platform::CUDAPinnedPlace, dst_place),
+        dst_ptr,
+        BOOST_GET_CONST(paddle::platform::CPUPlace, src_place),
+        src_ptr,
+        size);
+  } else if (paddle::platform::is_gpu_place(src_place) &&  // NOLINT
+             paddle::platform::is_cpu_place(dst_place)) {
+    auto src_gpu_place =
+        BOOST_GET_CONST(paddle::platform::CUDAPlace, src_place);
+    auto dst_cpu_place = BOOST_GET_CONST(paddle::platform::CPUPlace, dst_place);
+    auto ctx_place = dev_ctx.GetPlace();
+    PADDLE_ENFORCE_EQ(
+        paddle::platform::is_gpu_place(ctx_place),
+        true,
+        paddle::platform::errors::PreconditionNotMet(
+            "Context place error, excepted GPUPlace, but actually %s.",
+            ctx_place));
+    auto ctx_gpu_place =
+        BOOST_GET_CONST(paddle::platform::CUDAPlace, ctx_place);
+    PADDLE_ENFORCE_EQ(src_gpu_place,
+                      ctx_gpu_place,
+                      paddle::platform::errors::Unavailable(
+                          "Source place and context place do not match, source "
+                          "place is %s, context place is %s.",
+                          src_gpu_place,
+                          ctx_gpu_place));
+    auto stream =
+        reinterpret_cast<const paddle::platform::CUDADeviceContext&>(dev_ctx)
+            .stream();
+    paddle::memory::Copy(
+        dst_cpu_place, dst_ptr, src_gpu_place, src_ptr, size, stream);
+  } else if (paddle::platform::is_cpu_place(src_place) &&  // NOLINT
+             paddle::platform::is_gpu_place(dst_place)) {
+    auto src_cpu_place = BOOST_GET_CONST(paddle::platform::CPUPlace, src_place);
+    auto dst_gpu_place =
+        BOOST_GET_CONST(paddle::platform::CUDAPlace, dst_place);
+    auto ctx_place = dev_ctx.GetPlace();
+    PADDLE_ENFORCE_EQ(
+        paddle::platform::is_gpu_place(ctx_place),
+        true,
+        paddle::platform::errors::PreconditionNotMet(
+            "Context place error, excepted GPUPlace, but actually %s.",
+            ctx_place));
+    auto ctx_gpu_place =
+        BOOST_GET_CONST(paddle::platform::CUDAPlace, ctx_place);
+    PADDLE_ENFORCE_EQ(dst_gpu_place,
+                      ctx_gpu_place,
+                      paddle::platform::errors::Unavailable(
+                          "Destination place and context place do not match, "
+                          "destination place is %s, context place is %s.",
+                          dst_gpu_place,
+                          ctx_gpu_place));
+    auto stream =
+        reinterpret_cast<const paddle::platform::CUDADeviceContext&>(dev_ctx)
+            .stream();
+    paddle::memory::Copy(
+        dst_gpu_place, dst_ptr, src_cpu_place, src_ptr, size, stream);
+  } else if (paddle::platform::is_gpu_place(src_place) &&  // NOLINT
+             paddle::platform::is_cuda_pinned_place(dst_place)) {
+    auto src_gpu_place =
+        BOOST_GET_CONST(paddle::platform::CUDAPlace, src_place);
+    auto dst_cuda_pinned_place =
+        BOOST_GET_CONST(paddle::platform::CUDAPinnedPlace, dst_place);
+    auto ctx_place = dev_ctx.GetPlace();
+    PADDLE_ENFORCE_EQ(paddle::platform::is_gpu_place(ctx_place),
+                      true,
+                      paddle::platform::errors::PreconditionNotMet(
+                          "Device context place mismatch. When copying Tensor "
+                          "data from GPU memory to CUDA Pinned memory, current "
+                          "device context place should be GPU."));
+    auto ctx_gpu_place =
+        BOOST_GET_CONST(paddle::platform::CUDAPlace, ctx_place);
+    PADDLE_ENFORCE_EQ(src_gpu_place,
+                      ctx_gpu_place,
+                      paddle::platform::errors::PreconditionNotMet(
+                          "The source GPU device and current device context do "
+                          "not match. The source GPU device number is %d, but "
+                          "device context GPU number is %d.",
+                          src_gpu_place.device,
+                          ctx_gpu_place.device));
+    auto stream =
+        reinterpret_cast<const paddle::platform::CUDADeviceContext&>(dev_ctx)
+            .stream();
+    paddle::memory::Copy(
+        dst_cuda_pinned_place, dst_ptr, src_gpu_place, src_ptr, size, stream);
+  } else if (paddle::platform::is_cuda_pinned_place(src_place) &&  // NOLINT
+             paddle::platform::is_gpu_place(dst_place)) {
+    auto src_cuda_pinned_place =
+        BOOST_GET_CONST(paddle::platform::CUDAPinnedPlace, src_place);
+    auto dst_gpu_place =
+        BOOST_GET_CONST(paddle::platform::CUDAPlace, dst_place);
+    auto ctx_place = dev_ctx.GetPlace();
+    PADDLE_ENFORCE_EQ(paddle::platform::is_gpu_place(ctx_place),
+                      true,
+                      paddle::platform::errors::PreconditionNotMet(
+                          "Device context place mismatch. When copying Tensor "
+                          "data from CUDA Pinned memory to GPU memory, current "
+                          "device context place should be GPU."));
+    auto ctx_gpu_place =
+        BOOST_GET_CONST(paddle::platform::CUDAPlace, ctx_place);
+    PADDLE_ENFORCE_EQ(dst_gpu_place,
+                      ctx_gpu_place,
+                      paddle::platform::errors::PreconditionNotMet(
+                          "The target GPU device and current device context do "
+                          "not match. The target GPU device number is %d, but "
+                          "device context GPU number is %d.",
+                          dst_gpu_place.device,
+                          ctx_gpu_place.device));
+    auto stream =
+        reinterpret_cast<const paddle::platform::CUDADeviceContext&>(dev_ctx)
+            .stream();
+    paddle::memory::Copy(
+        dst_gpu_place, dst_ptr, src_cuda_pinned_place, src_ptr, size, stream);
+  } else if (paddle::platform::is_gpu_place(src_place) &&  // NOLINT
+             paddle::platform::is_gpu_place(dst_place)) {
+    auto src_gpu_place =
+        BOOST_GET_CONST(paddle::platform::CUDAPlace, src_place);
+    auto dst_gpu_place =
+        BOOST_GET_CONST(paddle::platform::CUDAPlace, dst_place);
+    auto ctx_place = dev_ctx.GetPlace();
+    PADDLE_ENFORCE_EQ(
+        paddle::platform::is_gpu_place(ctx_place),
+        true,
+        paddle::platform::errors::PreconditionNotMet(
+            "Context place error, excepted GPUPlace, but actually %s.",
+            ctx_place));
+    auto stream =
+        reinterpret_cast<const paddle::platform::CUDADeviceContext&>(dev_ctx)
+            .stream();
+    if (paddle::platform::is_same_place(src_place, dst_place)) {
+      paddle::memory::Copy(
+          dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, stream);
+    } else {
+      if (paddle::platform::is_same_place(ctx_place, src_place)) {
+        paddle::memory::Copy(
+            dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, stream);
+        paddle::platform::DeviceContextPool::Instance()
+            .Get(src.place())
+            ->Wait();
+      } else if (paddle::platform::is_same_place(ctx_place, dst_place)) {
+        paddle::platform::DeviceContextPool::Instance()
+            .Get(src.place())
+            ->Wait();
+        paddle::memory::Copy(
+            dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, stream);
+      } else {
+        PADDLE_THROW(paddle::platform::errors::Unavailable(
+            "Context place dose not match the source and destination place."));
+      }
+    }
+  }
+}
+
+}  // namespace pten
+
+// TODO(chenweihang): replace by better impl
+PT_REGISTER_MODULE(UtilsCUDA);
+
+PT_REGISTER_KERNEL_WITH_NO_TYPE("copy", CUDA, ANY, pten::Copy) {}
diff --git a/paddle/pten/kernels/cuda/utils.h b/paddle/pten/kernels/cuda/utils.h
new file mode 100644
index 0000000000000..0d79f04f2ee5e
--- /dev/null
+++ b/paddle/pten/kernels/cuda/utils.h
@@ -0,0 +1,33 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+// CUDA and HIP use same api
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+
+#include "paddle/pten/core/dense_tensor.h"
+#include "paddle/pten/core/kernel_registry.h"
+
+// See Note [ Why still include the fluid headers? ]
+#include "paddle/fluid/platform/device_context.h"
+namespace pten {
+
+using CUDAContext = paddle::platform::CUDADeviceContext;
+
+void Copy(const CUDAContext& dev_ctx, const DenseTensor& src, DenseTensor* dst);
+
+}  // namespace pten
+
+#endif
diff --git a/paddle/pten/kernels/functions/CMakeLists.txt b/paddle/pten/kernels/functions/CMakeLists.txt
new file mode 100644
index 0000000000000..a3b2bf314b4c0
--- /dev/null
+++ b/paddle/pten/kernels/functions/CMakeLists.txt
@@ -0,0 +1 @@
+add_subdirectory(eigen)
diff --git a/paddle/pten/kernels/functions/eigen/CMakeLists.txt b/paddle/pten/kernels/functions/eigen/CMakeLists.txt
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/paddle/pten/kernels/functions/eigen/common.h b/paddle/pten/kernels/functions/eigen/common.h
new file mode 100644
index 0000000000000..5ac083f710213
--- /dev/null
+++ b/paddle/pten/kernels/functions/eigen/common.h
@@ -0,0 +1,171 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <stdint.h>
+
+#include "paddle/pten/core/dense_tensor.h"
+#include "unsupported/Eigen/CXX11/Tensor"
+
+namespace pten {
+
+// EigenDim converts paddle::platform::DDim into Eigen::DSizes.
+template <int D>
+struct EigenDim {
+  using Type = Eigen::DSizes<Eigen::DenseIndex, D>;
+
+  static Type From(const DDim& dims) {
+    PADDLE_ENFORCE_EQ(arity(dims),
+                      D,
+                      paddle::platform::errors::InvalidArgument(
+                          "Input dimension size should be equal to %d, but "
+                          "received dimension size is %d.",
+                          arity(dims),
+                          D));
+    Type ret;
+    for (int64_t d = 0; d < arity(dims); d++) {
+      ret[d] = dims[d];
+    }
+    return ret;
+  }
+};
+
+// Interpret paddle::platform::Tensor as EigenTensor and EigenConstTensor.
+template <typename T,
+          size_t D,
+          int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+struct EigenTensor {
+  // TODO(qijun) Now, default type in unaligned, and we will make a benchmark on
+  // the speed of aligned and unaligned version in future.
+  using Type = Eigen::TensorMap<Eigen::Tensor<T, D, MajorType, IndexType>>;
+
+  using ConstType =
+      Eigen::TensorMap<Eigen::Tensor<const T, D, MajorType, IndexType>>;
+
+  static Type From(pten::DenseTensor& tensor, DDim dims) {  // NOLINT
+    // why tensor.data<T>() not work?
+    // return Type(const_cast<T*>(reinterpret_cast<const T*>(tensor.data())),
+    // EigenDim<D>::From(dims));
+    return Type(const_cast<T*>(tensor.data<T>()), EigenDim<D>::From(dims));
+  }
+
+  static Type From(pten::DenseTensor& tensor) {  // NOLINT
+    return From(tensor, tensor.dims());
+  }  // NOLINT
+
+  static ConstType From(const pten::DenseTensor& tensor, DDim dims) {
+    // return ConstType(reinterpret_cast<const T*>(tensor.data()),
+    // EigenDim<D>::From(dims));
+    return ConstType(tensor.data<T>(), EigenDim<D>::From(dims));
+  }
+
+  static ConstType From(const pten::DenseTensor& tensor) {
+    return From(tensor, tensor.dims());
+  }
+};
+
+template <typename T,
+          int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+struct EigenMatrix : public EigenTensor<T, 2, MajorType, IndexType> {
+  static typename EigenMatrix::Type Reshape(
+      pten::DenseTensor& tensor,  // NOLINT
+      int num_col_dims) {
+    int rank = tensor.dims().size();
+    PADDLE_ENFORCE_EQ((num_col_dims > 0 && num_col_dims < rank),
+                      true,
+                      paddle::platform::errors::InvalidArgument(
+                          "Input dimension number(num_col_dims) must be "
+                          "between 0 and %d, but received number is %d.",
+                          rank,
+                          num_col_dims));
+    return EigenMatrix::From(tensor,
+                             flatten_to_2d(tensor.dims(), num_col_dims));
+  }
+
+  static typename EigenMatrix::ConstType Reshape(
+      const pten::DenseTensor& tensor, int num_col_dims) {
+    int rank = tensor.dims().size();
+    PADDLE_ENFORCE_EQ((num_col_dims > 0 && num_col_dims < rank),
+                      true,
+                      paddle::platform::errors::InvalidArgument(
+                          "Input dimension number(num_col_dims) must be "
+                          "between 0 and %d, but received number is %d.",
+                          rank,
+                          num_col_dims));
+    return EigenMatrix::From(tensor,
+                             flatten_to_2d(tensor.dims(), num_col_dims));
+  }
+};
+
+template <typename T,
+          int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+struct EigenVector : public EigenTensor<T, 1, MajorType, IndexType> {
+  // Flatten reshapes a Tensor into an EigenVector.
+  static typename EigenVector::Type Flatten(
+      pten::DenseTensor& tensor) {  // NOLINT
+    return EigenVector::From(tensor, {product(tensor.dims())});
+  }
+
+  static typename EigenVector::ConstType Flatten(
+      const pten::DenseTensor& tensor) {  // NOLINT
+    return EigenVector::From(tensor, {product(tensor.dims())});
+  }
+};
+
+template <typename T,
+          int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+struct EigenScalar {
+  // Scalar tensor (implemented as a rank-0 tensor) of scalar type T.
+  using Type = Eigen::TensorMap<
+      Eigen::TensorFixedSize<T, Eigen::Sizes<>, MajorType, IndexType>>;
+  using ConstType = Eigen::TensorMap<
+      Eigen::TensorFixedSize<const T, Eigen::Sizes<>, MajorType, IndexType>>;
+
+  static Type From(pten::DenseTensor& tensor) {  // NOLINT
+    return Type(const_cast<T*>(tensor.data<T>()));
+  }
+
+  static ConstType From(const pten::DenseTensor& tensor) {
+    return ConstType(tensor.data<T>());
+  }
+};
+
+// Define Tensor with 32-bit index.
+template <typename T, int D, int MajorType = Eigen::RowMajor>
+using Tensor32BitIndex =
+    Eigen::TensorMap<Eigen::Tensor<T, D, MajorType, int>, Eigen::Aligned>;
+
+template <typename DSizes>
+Eigen::DSizes<int, DSizes::count> To32BitDims(const DSizes& in) {
+  Eigen::DSizes<int, DSizes::count> out;
+  for (int i = 0; i < DSizes::count; ++i) {
+    out[i] = in[i];
+  }
+  return out;
+}
+
+template <typename EigenTensor>
+Tensor32BitIndex<typename EigenTensor::Scalar, EigenTensor::NumIndices>
+To32BitIndex(EigenTensor in) {
+  using RetType =
+      Tensor32BitIndex<typename EigenTensor::Scalar, EigenTensor::NumIndices>;
+  return RetType(in.data(), To32BitDims(in.dimensions()));
+}
+
+}  // namespace pten
diff --git a/paddle/pten/kernels/functions/eigen/dot.h b/paddle/pten/kernels/functions/eigen/dot.h
new file mode 100644
index 0000000000000..300da4ae1f13b
--- /dev/null
+++ b/paddle/pten/kernels/functions/eigen/dot.h
@@ -0,0 +1,49 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/pten/core/dense_tensor.h"
+#include "paddle/pten/kernels/functions/eigen/common.h"
+
+// See Note [ Why still include the fluid headers? ]
+#include "paddle/fluid/operators/eigen/eigen_function.h"
+
+namespace pten {
+namespace eigen {
+
+template <typename DevCtx, typename T>
+void Dot(const DevCtx& dev_ctx,
+         const DenseTensor& x,
+         const DenseTensor& y,
+         DenseTensor* out) {
+  if (1 == out->dims().size()) {
+    auto eigen_out = pten::EigenScalar<T>::From(*out);
+    auto eigen_x = pten::EigenVector<T>::Flatten(x);
+    auto eigen_y = pten::EigenVector<T>::Flatten(y);
+
+    auto& dev = *dev_ctx.eigen_device();
+    eigen_out.device(dev) = (eigen_x * eigen_y).sum();
+  } else {
+    auto eigen_out = pten::EigenMatrix<T>::From(*out);
+    auto eigen_x = pten::EigenMatrix<T>::From(x);
+    auto eigen_y = pten::EigenMatrix<T>::From(y);
+
+    auto& dev = *dev_ctx.eigen_device();
+    eigen_out.device(dev) = (eigen_x * eigen_y).sum(Eigen::DSizes<int, 1>(1));
+  }
+}
+
+}  // namespace eigen
+}  // namespace pten
diff --git a/paddle/pten/kernels/functions/eigen/fill.h b/paddle/pten/kernels/functions/eigen/fill.h
new file mode 100644
index 0000000000000..122a6aef22dc6
--- /dev/null
+++ b/paddle/pten/kernels/functions/eigen/fill.h
@@ -0,0 +1,34 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/pten/core/dense_tensor.h"
+#include "paddle/pten/kernels/functions/eigen/common.h"
+
+// See Note [ Why still include the fluid headers? ]
+#include "paddle/fluid/operators/eigen/eigen_function.h"
+
+namespace pten {
+namespace eigen {
+
+template <typename DeviceContext, typename T, typename VType>
+void fill(const DeviceContext& context, DenseTensor* tensor, VType val) {
+  tensor->mutable_data<T>();
+  auto t = pten::EigenVector<T>::Flatten(*tensor);
+  t.device(*context.eigen_device()) = t.constant(static_cast<T>(val));
+}
+
+}  // namespace eigen
+}  // namespace pten
diff --git a/paddle/pten/kernels/functions/eigen/mean.h b/paddle/pten/kernels/functions/eigen/mean.h
new file mode 100644
index 0000000000000..ee4bf1653f23a
--- /dev/null
+++ b/paddle/pten/kernels/functions/eigen/mean.h
@@ -0,0 +1,39 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/pten/core/dense_tensor.h"
+#include "paddle/pten/kernels/functions/eigen/common.h"
+
+// See Note [ Why still include the fluid headers? ]
+#include "paddle/fluid/operators/eigen/eigen_function.h"
+
+namespace pten {
+namespace eigen {
+
+template <typename DevCtx, typename T>
+void Mean(const DevCtx& dev_ctx, const DenseTensor& x, DenseTensor* out) {
+  // TODO(chenweihang): if we design new tensor, we should support
+  // the low-level calc functor use new tensor as input,
+  // which may be a big project!
+  auto eigen_x = pten::EigenVector<T>::Flatten(x);
+  auto eigen_out = pten::EigenScalar<T>::From(*out);
+
+  auto& dev = *dev_ctx.eigen_device();
+  eigen_out.device(dev) = eigen_x.mean();
+}
+
+}  // namespace eigen
+}  // namespace pten
diff --git a/paddle/pten/kernels/functions/eigen/scale.h b/paddle/pten/kernels/functions/eigen/scale.h
new file mode 100644
index 0000000000000..49ee561df50ec
--- /dev/null
+++ b/paddle/pten/kernels/functions/eigen/scale.h
@@ -0,0 +1,51 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/pten/core/dense_tensor.h"
+#include "paddle/pten/kernels/functions/eigen/common.h"
+
+// See Note [ Why still include the fluid headers? ]
+#include "paddle/fluid/operators/eigen/eigen_function.h"
+
+namespace pten {
+namespace eigen {
+
+template <typename DevCtx, typename T>
+void Scale(const DevCtx& dev_ctx,
+           const DenseTensor& x,
+           float scale,
+           float bias,
+           bool bias_after_scale,
+           DenseTensor* out) {
+  // calc
+  out->mutable_data<T>();
+  auto eigen_out = pten::EigenVector<T>::Flatten(*out);
+  auto eigen_x = pten::EigenVector<T>::Flatten(x);
+  auto& dev = *dev_ctx.eigen_device();
+  // TODO(chenweihang): now the eigen function here need the dtype of scale,
+  // eigen_x, bias should be same, so here need cast for two scalar arg,
+  // maybe we declare that the type of scale and bias is T?
+  paddle::operators::EigenScale<std::decay_t<decltype(dev)>, T>::Eval(
+      dev,
+      eigen_out,
+      eigen_x,
+      static_cast<T>(scale),
+      static_cast<T>(bias),
+      bias_after_scale);
+}
+
+}  // namespace eigen
+}  // namespace pten
diff --git a/paddle/pten/kernels/functions/eigen/sign.h b/paddle/pten/kernels/functions/eigen/sign.h
new file mode 100644
index 0000000000000..5cd620815bf26
--- /dev/null
+++ b/paddle/pten/kernels/functions/eigen/sign.h
@@ -0,0 +1,41 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/pten/core/dense_tensor.h"
+#include "paddle/pten/kernels/functions/eigen/common.h"
+
+// See Note [ Why still include the fluid headers? ]
+#include "paddle/fluid/operators/eigen/eigen_function.h"
+
+namespace pten {
+namespace eigen {
+
+template <typename DevCtx, typename T>
+void Sign(const DevCtx& dev_ctx, const DenseTensor& x, DenseTensor* out) {
+  out->mutable_data<T>();
+  // TODO(chenweihang): if we design new tensor, we should support
+  // the low-level calc functor use new tensor as input,
+  // which may be a big project!
+  auto eigen_out = pten::EigenVector<T>::Flatten(*out);
+  auto eigen_x = pten::EigenVector<T>::Flatten(x);
+
+  auto& dev = *dev_ctx.eigen_device();
+  paddle::operators::EigenSign<std::decay_t<decltype(dev)>, T>::Eval(
+      dev, eigen_out, eigen_x);
+}
+
+}  // namespace eigen
+}  // namespace pten
diff --git a/paddle/pten/kernels/functions/math/matmul_func.h b/paddle/pten/kernels/functions/math/matmul_func.h
new file mode 100644
index 0000000000000..b5ddd26a95576
--- /dev/null
+++ b/paddle/pten/kernels/functions/math/matmul_func.h
@@ -0,0 +1,491 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/fluid/operators/math/blas.h"
+#include "paddle/fluid/operators/math/complex_functors.h"
+
+#include "paddle/fluid/operators/eigen/eigen_function.h"
+#include "paddle/pten/core/dense_tensor.h"
+#include "paddle/pten/kernels/functions/eigen/common.h"
+
+namespace pten {
+namespace math {
+
+static void GetBroadcastFromDims(const int x_ndim,
+                                 const std::int64_t* x_dims,
+                                 const int y_ndim,
+                                 const std::int64_t* y_dims,
+                                 std::int64_t* x_bd_dims,
+                                 std::int64_t* y_bd_dims,
+                                 std::int64_t* out_bd_dims) {
+  const int ndim = (std::max)(x_ndim, y_ndim);
+  std::fill(x_bd_dims, x_bd_dims + ndim - x_ndim, 1);
+  std::fill(y_bd_dims, y_bd_dims + ndim - y_ndim, 1);
+  std::copy(x_dims, x_dims + x_ndim, x_bd_dims + ndim - x_ndim);
+  std::copy(y_dims, y_dims + y_ndim, y_bd_dims + ndim - y_ndim);
+
+  for (int i = 0; i < ndim; ++i) {
+    PADDLE_ENFORCE_EQ(
+        x_bd_dims[i] == y_bd_dims[i] || x_bd_dims[i] <= 1 || y_bd_dims[i] <= 1,
+        true,
+        paddle::platform::errors::InvalidArgument(
+            "Input(X) and Input(Y) has error dim."
+            "X_broadcast's shape[%s] must be equal to Y_broadcast's shape[%s],"
+            "or X_broadcast's shape[%s] <= 1, or Y_broadcast's shape[%s] <= 1,"
+            "But received X_broadcast's shape[%s] = [%s]"
+            "received Y_broadcast's shape[%s] = [%s]",
+            i,
+            i,
+            i,
+            i,
+            i,
+            x_bd_dims[i],
+            i,
+            y_bd_dims[i]));
+    if (x_bd_dims[i] == 0 || y_bd_dims[i] == 0) {
+      out_bd_dims[i] = 0;
+    } else {
+      out_bd_dims[i] = (std::max)(x_bd_dims[i], y_bd_dims[i]);
+    }
+  }
+}
+
+static int64_t GetIndexMessage(const int n,
+                               const int64_t* dims,
+                               const int64_t* index) {
+  int64_t sum = 0;
+  for (int i = 0; i < n; ++i) {
+    if (dims[i] > 1) {
+      sum = sum * dims[i] + index[i];
+    }
+  }
+  return sum;
+}
+
+static void IndexIncreaseFromDims(const int ndim,
+                                  const int64_t* dims,
+                                  int64_t* index) {
+  for (int i = ndim - 1; i >= 0; --i) {
+    ++index[i];
+    if (index[i] >= dims[i]) {
+      index[i] -= dims[i];
+    } else {
+      break;
+    }
+  }
+}
+
+template <typename DeviceContext, typename T>
+void MatMulFunction(const DeviceContext& dev_ctx,
+                    const DenseTensor& X,
+                    const DenseTensor& Y,
+                    const std::vector<std::int64_t>& x_dims,
+                    const std::vector<std::int64_t>& y_dims,
+                    DenseTensor* Out,
+                    bool trans_x,
+                    bool trans_y,
+                    bool flag = false) {
+  const int x_ndim = x_dims.size();
+  const int y_ndim = y_dims.size();
+
+  // Get data ptr
+  const T* x_data = X.data<T>();
+  const T* y_data = Y.data<T>();
+
+  if (x_ndim == 1 && y_ndim == 1) {
+    PADDLE_ENFORCE_EQ(
+        X.numel(),
+        Y.numel(),
+        paddle::platform::errors::InvalidArgument(
+            "X's numbers must be equal to Y's numbers,"
+            "when X/Y's dims =1. But received X has [%d] elements,"
+            "received Y has [%d] elements",
+            X.numel(),
+            Y.numel()));
+    VLOG(3) << "MatMul's case 1";
+    Out->Resize({1});
+    Out->mutable_data<T>();
+    auto out_eigen = EigenScalar<T>::From(*Out);
+    auto x_eigen = EigenVector<T>::Flatten(X);
+    auto y_eigen = EigenVector<T>::Flatten(Y);
+
+    auto& dev = *dev_ctx.eigen_device();
+    if (flag) {
+      out_eigen.device(dev) = (x_eigen * y_eigen).sum() + out_eigen;
+    } else {
+      out_eigen.device(dev) = (x_eigen * y_eigen).sum();
+    }
+    return;
+  }
+
+  auto blas = paddle::operators::math::GetBlas<DeviceContext, T>(dev_ctx);
+
+  if (x_ndim == 1) {
+    const int N = X.numel();
+    if (trans_y) {
+      PADDLE_ENFORCE_EQ(y_dims[y_ndim - 1],
+                        N,
+                        paddle::platform::errors::InvalidArgument(
+                            "Input(Y) has error dim."
+                            "Y'dims[%d] must be equal to %d"
+                            "But received Y'dims[%d] is %d",
+                            y_ndim - 1,
+                            N,
+                            y_ndim - 1,
+                            y_dims[y_ndim - 1]));
+    } else {
+      PADDLE_ENFORCE_EQ(y_dims[y_ndim - 2],
+                        N,
+                        paddle::platform::errors::InvalidArgument(
+                            "Input(Y) has error dim."
+                            "Y'dims[%d] must be equal to %d"
+                            "But received Y'dims[%d] is %d",
+                            y_ndim - 2,
+                            N,
+                            y_ndim - 2,
+                            y_dims[y_ndim - 2]));
+    }
+    std::vector<std::int64_t> out_dims(y_ndim - 1);
+    if (trans_y) {
+      std::copy_n(y_dims.cbegin(), y_ndim - 1, out_dims.begin());
+    } else {
+      std::copy_n(y_dims.cbegin(), y_ndim - 2, out_dims.begin());
+      out_dims.back() = y_dims.back();
+    }
+    Out->Resize(paddle::framework::make_ddim(out_dims));
+    Out->mutable_data<T>();
+    if (trans_y) {
+      const int M = Y.numel() / N;
+      VLOG(3) << "MatMul's case 2";
+      blas.GEMV(false,
+                M,
+                N,
+                static_cast<T>(1),
+                y_data,
+                x_data,
+                static_cast<T>(flag),
+                Out->mutable_data<T>());
+    } else {
+      const int M = y_dims[y_ndim - 1];
+      const int batch_size = Y.numel() / (M * N);
+      if (batch_size == 1) {
+        VLOG(3) << "MatMul's case 3";
+        blas.GEMV(true,
+                  N,
+                  M,
+                  static_cast<T>(1),
+                  y_data,
+                  x_data,
+                  static_cast<T>(flag),
+                  Out->mutable_data<T>());
+      } else {
+        VLOG(3) << "MatMul's case 4";
+        blas.BatchedGEMM(CblasTrans,
+                         CblasNoTrans,
+                         M,
+                         1,
+                         N,
+                         static_cast<T>(1),
+                         y_data,
+                         x_data,
+                         static_cast<T>(flag),
+                         Out->mutable_data<T>(),
+                         batch_size,
+                         M * N,
+                         0);
+      }
+    }
+    return;
+  }
+
+  if (y_ndim == 1) {
+    const int N = Y.numel();
+    if (trans_x) {
+      PADDLE_ENFORCE_EQ(x_dims[x_ndim - 2],
+                        N,
+                        paddle::platform::errors::InvalidArgument(
+                            "Input(X) has error dim."
+                            "X'dims[%d] must be equal to %d"
+                            "But received X'dims[%d] is %d",
+                            x_ndim - 2,
+                            N,
+                            x_ndim - 2,
+                            x_dims[x_ndim - 2]));
+    } else {
+      PADDLE_ENFORCE_EQ(x_dims[x_ndim - 1],
+                        N,
+                        paddle::platform::errors::InvalidArgument(
+                            "Input(X) has error dim."
+                            "X'dims[%d] must be equal to %d"
+                            "But received X'dims[%d] is %d",
+                            x_ndim - 1,
+                            N,
+                            x_ndim - 1,
+                            x_dims[x_ndim - 1]));
+    }
+    std::vector<std::int64_t> out_dims(x_ndim - 1);
+    if (trans_x) {
+      std::copy_n(x_dims.cbegin(), x_ndim - 2, out_dims.begin());
+      out_dims.back() = x_dims.back();
+    } else {
+      std::copy_n(x_dims.cbegin(), x_ndim - 1, out_dims.begin());
+    }
+    Out->Resize(paddle::framework::make_ddim(out_dims));
+    Out->mutable_data<T>();
+
+    if (trans_x) {
+      const int M = x_dims[x_ndim - 1];
+      const int batch_size = X.numel() / (M * N);
+      if (batch_size == 1) {
+        VLOG(3) << "MatMul's case 5";
+        blas.GEMV(true,
+                  N,
+                  M,
+                  static_cast<T>(1),
+                  x_data,
+                  y_data,
+                  static_cast<T>(flag),
+                  Out->mutable_data<T>());
+      } else {
+        VLOG(3) << "MatMul's case 6";
+        blas.BatchedGEMM(CblasTrans,
+                         CblasNoTrans,
+                         M,
+                         1,
+                         N,
+                         static_cast<T>(1),
+                         x_data,
+                         y_data,
+                         static_cast<T>(flag),
+                         Out->mutable_data<T>(),
+                         batch_size,
+                         M * N,
+                         0);
+      }
+    } else {
+      const int M = X.numel() / N;
+      VLOG(3) << "MatMul's case 7";
+      blas.GEMV(false,
+                M,
+                N,
+                static_cast<T>(1),
+                x_data,
+                y_data,
+                static_cast<T>(flag),
+                Out->mutable_data<T>());
+    }
+    return;
+  }
+
+  const int M = trans_x ? x_dims[x_ndim - 1] : x_dims[x_ndim - 2];
+  const int K = trans_x ? x_dims[x_ndim - 2] : x_dims[x_ndim - 1];
+  if (trans_y) {
+    PADDLE_ENFORCE_EQ(y_dims[y_ndim - 1],
+                      K,
+                      paddle::platform::errors::InvalidArgument(
+                          "Input(Y) has error dim."
+                          "Y'dims[%d] must be equal to %d"
+                          "But received Y'dims[%d] is %d",
+                          y_ndim - 1,
+                          K,
+                          y_ndim - 1,
+                          y_dims[y_ndim - 1]));
+  } else {
+    PADDLE_ENFORCE_EQ(y_dims[y_ndim - 2],
+                      K,
+                      paddle::platform::errors::InvalidArgument(
+                          "Input(Y) has error dim."
+                          "Y'dims[%d] must be equal to %d"
+                          "But received Y'dims[%d] is %d",
+                          y_ndim - 2,
+                          K,
+                          y_ndim - 2,
+                          y_dims[y_ndim - 2]));
+  }
+  const int N = trans_y ? y_dims[y_ndim - 2] : y_dims[y_ndim - 1];
+  const int ndim = (std::max)(x_ndim, y_ndim);
+  std::vector<std::int64_t> x_broadcast_dims(ndim);
+  std::vector<std::int64_t> y_broadcast_dims(ndim);
+  std::vector<std::int64_t> out_broadcast_dims(ndim);
+
+  GetBroadcastFromDims(x_ndim - 2,
+                       x_dims.data(),
+                       y_ndim - 2,
+                       y_dims.data(),
+                       x_broadcast_dims.data(),
+                       y_broadcast_dims.data(),
+                       out_broadcast_dims.data());
+  out_broadcast_dims[ndim - 2] = M;
+  out_broadcast_dims[ndim - 1] = N;
+
+  Out->Resize(paddle::framework::make_ddim(out_broadcast_dims));
+  Out->mutable_data<T>();
+
+  const int batch_dim = ndim - 2;
+  // broadcast message
+  const bool is_broadcast_dims =
+      !std::equal(x_broadcast_dims.cbegin(),
+                  x_broadcast_dims.cbegin() + batch_dim,
+                  y_broadcast_dims.cbegin());
+
+  const std::int64_t x_batch_size =
+      std::accumulate(x_broadcast_dims.cbegin(),
+                      x_broadcast_dims.cbegin() + batch_dim,
+                      1LL,
+                      std::multiplies<std::int64_t>());
+  const std::int64_t y_batch_size =
+      std::accumulate(y_broadcast_dims.cbegin(),
+                      y_broadcast_dims.cbegin() + batch_dim,
+                      1LL,
+                      std::multiplies<std::int64_t>());
+  const std::int64_t out_batch_size =
+      std::accumulate(out_broadcast_dims.cbegin(),
+                      out_broadcast_dims.cbegin() + batch_dim,
+                      1LL,
+                      std::multiplies<std::int64_t>());
+  if (out_batch_size == 0) return;
+  if (x_batch_size == 1 && y_batch_size == 1) {
+    VLOG(3) << "MatMul's case 8";
+    blas.GEMM(trans_x ? CblasTrans : CblasNoTrans,
+              trans_y ? CblasTrans : CblasNoTrans,
+              M,
+              N,
+              K,
+              static_cast<T>(1),
+              x_data,
+              y_data,
+              static_cast<T>(flag),
+              Out->mutable_data<T>());
+  } else if (x_batch_size == 1) {
+    if (M == 1 && trans_y) {
+      VLOG(3) << "MatMul's case 9";
+      blas.GEMV(false,
+                y_batch_size * N,
+                K,
+                static_cast<T>(1),
+                y_data,
+                x_data,
+                static_cast<T>(flag),
+                Out->mutable_data<T>());
+    } else {
+      VLOG(3) << "MatMul's case 10";
+      blas.BatchedGEMM(trans_x ? CblasTrans : CblasNoTrans,
+                       trans_y ? CblasTrans : CblasNoTrans,
+                       M,
+                       N,
+                       K,
+                       static_cast<T>(1),
+                       x_data,
+                       y_data,
+                       static_cast<T>(flag),
+                       Out->mutable_data<T>(),
+                       out_batch_size,
+                       0,
+                       K * N);
+    }
+  } else if (y_batch_size == 1) {
+    if (!trans_x) {
+      VLOG(3) << "MatMul's case 11";
+      blas.GEMM(CblasNoTrans,
+                trans_y ? CblasTrans : CblasNoTrans,
+                x_batch_size * M,
+                N,
+                K,
+                static_cast<T>(1),
+                x_data,
+                y_data,
+                static_cast<T>(flag),
+                Out->mutable_data<T>());
+    } else {
+      VLOG(3) << "MatMul's case 12";
+      blas.BatchedGEMM(CblasTrans,
+                       trans_y ? CblasTrans : CblasNoTrans,
+                       M,
+                       N,
+                       K,
+                       static_cast<T>(1),
+                       x_data,
+                       y_data,
+                       static_cast<T>(flag),
+                       Out->mutable_data<T>(),
+                       out_batch_size,
+                       M * K,
+                       0);
+    }
+  } else if (!is_broadcast_dims) {
+    VLOG(3) << "MatMul's case 13";
+    blas.BatchedGEMM(trans_x ? CblasTrans : CblasNoTrans,
+                     trans_y ? CblasTrans : CblasNoTrans,
+                     M,
+                     N,
+                     K,
+                     static_cast<T>(1),
+                     x_data,
+                     y_data,
+                     static_cast<T>(flag),
+                     Out->mutable_data<T>(),
+                     out_batch_size,
+                     M * K,
+                     K * N);
+  } else {
+    // in the case, can't use stridedgemm
+    std::vector<const T*> x_ptr(out_batch_size);
+    std::vector<const T*> y_ptr(out_batch_size);
+    std::vector<T*> out_ptr(out_batch_size);
+    std::vector<std::int64_t> index(batch_dim, 0);
+    for (std::int64_t i = 0; i < out_batch_size; ++i) {
+      // using the index to get offset
+      const std::int64_t x_index =
+          GetIndexMessage(batch_dim, x_broadcast_dims.data(), index.data());
+      const std::int64_t y_index =
+          GetIndexMessage(batch_dim, y_broadcast_dims.data(), index.data());
+
+      x_ptr[i] = x_data + x_index * M * K;
+      y_ptr[i] = y_data + y_index * K * N;
+      out_ptr[i] = Out->mutable_data<T>() + i * M * N;
+      IndexIncreaseFromDims(batch_dim, out_broadcast_dims.data(), index.data());
+    }
+    VLOG(3) << "MatMul's case 14";
+    blas.BatchedGEMM(trans_x ? CblasTrans : CblasNoTrans,
+                     trans_y ? CblasTrans : CblasNoTrans,
+                     M,
+                     N,
+                     K,
+                     static_cast<T>(1),
+                     x_ptr.data(),
+                     y_ptr.data(),
+                     static_cast<T>(flag),
+                     out_ptr.data(),
+                     out_batch_size);
+  }
+}
+
+template <typename DeviceContext, typename T>
+void MatMulFunction(const DeviceContext& dev_ctx,
+                    const DenseTensor& X,
+                    const DenseTensor& Y,
+                    DenseTensor* Out,
+                    bool trans_x,
+                    bool trans_y,
+                    bool flag = false) {
+  const std::vector<std::int64_t> x_dims = vectorize(X.dims());
+  const std::vector<std::int64_t> y_dims = vectorize(Y.dims());
+  MatMulFunction<DeviceContext, T>(
+      dev_ctx, X, Y, x_dims, y_dims, Out, trans_x, trans_y, flag);
+}
+
+}  // namespace math
+}  // namespace pten
diff --git a/paddle/pten/kernels/mkldnn/CMakeLists.txt b/paddle/pten/kernels/mkldnn/CMakeLists.txt
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/paddle/pten/kernels/npu/CMakeLists.txt b/paddle/pten/kernels/npu/CMakeLists.txt
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/paddle/pten/kernels/xpu/CMakeLists.txt b/paddle/pten/kernels/xpu/CMakeLists.txt
new file mode 100644
index 0000000000000..3ba070bdd6c96
--- /dev/null
+++ b/paddle/pten/kernels/xpu/CMakeLists.txt
@@ -0,0 +1,2 @@
+cc_library(utils_xpu SRCS utils.cc DEPS dense_tensor kernel_context kernel_factory memory convert_utils)
+cc_library(manipulation_xpu SRCS manipulation.cc DEPS dense_tensor kernel_context kernel_factory utils_xpu unary)
diff --git a/paddle/pten/kernels/xpu/manipulation.cc b/paddle/pten/kernels/xpu/manipulation.cc
new file mode 100644
index 0000000000000..379e459a60515
--- /dev/null
+++ b/paddle/pten/kernels/xpu/manipulation.cc
@@ -0,0 +1,82 @@
+//   Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/pten/kernels/xpu/manipulation.h"
+#include "paddle/pten/infershape/unary.h"
+#include "paddle/pten/kernels/xpu/utils.h"
+
+namespace pten {
+
+template <typename T>
+void Flatten(const XPUContext& dev_ctx,
+             const DenseTensor& x,
+             int start_axis,
+             int stop_axis,
+             DenseTensor* out) {
+  auto out_dims = out->dims();
+  pten::Copy(dev_ctx, x, out);
+  out->Resize(out_dims);
+}
+
+// TODO(yuanrisheng): this kernel is for training and xshape is a Intermediate
+// Output Tensor，
+// is there a more flexible way to deal with this case?
+template <typename T>
+void FlattenWithXShape(const XPUContext& dev_ctx,
+                       const DenseTensor& x,
+                       int start_axis,
+                       int stop_axis,
+                       DenseTensor* out,
+                       DenseTensor* xshape) {
+  Flatten<T>(dev_ctx, x, start_axis, stop_axis, out);
+  const auto& in_dims = x.dims();
+  std::vector<int64_t> xshape_dims(in_dims.size() + 1);
+  xshape_dims[0] = 0;
+  for (int i = 0; i < in_dims.size(); ++i) {
+    xshape_dims[i + 1] = in_dims[i];
+  }
+  xshape->Resize(paddle::framework::make_ddim(xshape_dims));
+  xshape->set_lod(x.lod());
+}
+
+}  // namespace pten
+
+// TODO(chenweihang): replace by better impl
+PT_REGISTER_MODULE(ManipulationXPU);
+
+// TODO(yuanrisheng): "flatten_contiguous_range" is compatible with old kernel
+// architecture, kernel_name should be "flatten".
+PT_REGISTER_KERNEL("flatten_contiguous_range",
+                   XPU,
+                   ANY,
+                   pten::Flatten,
+                   float,
+                   paddle::platform::float16,
+                   double,
+                   uint8_t,
+                   int8_t,
+                   int,
+                   int64_t) {}
+
+PT_REGISTER_KERNEL("flatten_contiguous_range.mid",
+                   XPU,
+                   ANY,
+                   pten::FlattenWithXShape,
+                   float,
+                   paddle::platform::float16,
+                   double,
+                   uint8_t,
+                   int8_t,
+                   int,
+                   int64_t) {}
diff --git a/paddle/pten/kernels/xpu/manipulation.h b/paddle/pten/kernels/xpu/manipulation.h
new file mode 100644
index 0000000000000..02947759b477e
--- /dev/null
+++ b/paddle/pten/kernels/xpu/manipulation.h
@@ -0,0 +1,38 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#ifdef PADDLE_WITH_XPU
+
+#include "paddle/pten/core/dense_tensor.h"
+#include "paddle/pten/core/kernel_registry.h"
+
+// See Note [ Why still include the fluid headers? ]
+#include "paddle/fluid/platform/device_context.h"
+
+namespace pten {
+
+using XPUContext = paddle::platform::XPUDeviceContext;
+
+template <typename T>
+void Flatten(const XPUContext& dev_ctx,
+             const DenseTensor& x,
+             int start_axis,
+             int stop_axis,
+             DenseTensor* out);
+
+}  // namespace pten
+
+#endif
diff --git a/paddle/pten/kernels/xpu/utils.cc b/paddle/pten/kernels/xpu/utils.cc
new file mode 100644
index 0000000000000..33bdc66ff01f3
--- /dev/null
+++ b/paddle/pten/kernels/xpu/utils.cc
@@ -0,0 +1,81 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/pten/kernels/xpu/utils.h"
+#include "paddle/fluid/memory/memcpy.h"
+#include "paddle/pten/common/data_type.h"
+#include "paddle/pten/core/convert_utils.h"
+
+namespace pten {
+
+void Copy(const XPUDeviceContext& dev_ctx,
+          const DenseTensor& src,
+          DenseTensor* dst) {
+  auto* src_ptr = src.data();
+  auto* dst_ptr = dst->mutable_data();
+  const auto& src_place = src.place();
+  const auto& dst_place = dst->place();
+
+  if (src_ptr == dst_ptr && src_place == dst_place) {
+    VLOG(3) << "Skip copy the same data async from " << src_place << " to "
+            << dst_place;
+    return;
+  }
+  VLOG(4) << "src:" << src_ptr << ", dst:" << dst_ptr;
+
+  VLOG(3) << "TensorCopy " << src.dims() << " from " << src.place() << " to "
+          << dst_place;
+  dst->Resize(src.dims());
+  CHECK(dst->layout() == src.layout());
+  auto size = src.numel() * paddle::framework::SizeOfType(
+                                TransToProtoVarType(src.data_type()));
+
+  if (paddle::platform::is_xpu_place(src_place) &&  // NOLINT
+      paddle::platform::is_cpu_place(dst_place)) {
+    paddle::memory::Copy(BOOST_GET_CONST(paddle::platform::CPUPlace, dst_place),
+                         dst_ptr,
+                         BOOST_GET_CONST(paddle::platform::XPUPlace, src_place),
+                         src_ptr,
+                         size);
+  } else if (paddle::platform::is_cpu_place(src_place) &&
+             paddle::platform::is_xpu_place(dst_place)) {
+    paddle::memory::Copy(BOOST_GET_CONST(paddle::platform::XPUPlace, dst_place),
+                         dst_ptr,
+                         BOOST_GET_CONST(paddle::platform::CPUPlace, src_place),
+                         src_ptr,
+                         size);
+  } else if (paddle::platform::is_xpu_place(src_place) &&
+             paddle::platform::is_xpu_place(dst_place)) {
+    if (src_ptr == dst_ptr) {
+      VLOG(3) << "Skip copy the same data async from " << src_place << " to "
+              << dst_place;
+      return;
+    }
+    paddle::memory::Copy(BOOST_GET_CONST(paddle::platform::XPUPlace, dst_place),
+                         dst_ptr,
+                         BOOST_GET_CONST(paddle::platform::XPUPlace, src_place),
+                         src_ptr,
+                         size);
+  } else {
+    PADDLE_THROW(paddle::platform::errors::Unimplemented(
+        "Copy from %s to %s is not supported.", src_place, dst_place));
+  }
+}
+
+}  // namespace pten
+
+// TODO(chenweihang): replace by better impl
+PT_REGISTER_MODULE(UtilsXPU);
+
+PT_REGISTER_KERNEL_WITH_NO_TYPE("copy", XPU, ANY, pten::Copy) {}
diff --git a/paddle/pten/kernels/xpu/utils.h b/paddle/pten/kernels/xpu/utils.h
new file mode 100644
index 0000000000000..c92812ed68842
--- /dev/null
+++ b/paddle/pten/kernels/xpu/utils.h
@@ -0,0 +1,34 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#ifdef PADDLE_WITH_XPU
+
+#include "paddle/pten/core/dense_tensor.h"
+#include "paddle/pten/core/kernel_registry.h"
+
+// See Note [ Why still include the fluid headers? ]
+#include "paddle/fluid/platform/device_context.h"
+namespace pten {
+
+using XPUDeviceContext = paddle::platform::XPUDeviceContext;
+
+void Copy(const XPUDeviceContext& dev_ctx,
+          const DenseTensor& src,
+          DenseTensor* dst);
+
+}  // namespace pten
+
+#endif
diff --git a/paddle/pten/tests/CMakeLists.txt b/paddle/pten/tests/CMakeLists.txt
new file mode 100644
index 0000000000000..9946821581c3a
--- /dev/null
+++ b/paddle/pten/tests/CMakeLists.txt
@@ -0,0 +1,4 @@
+add_subdirectory(api)
+add_subdirectory(common)
+add_subdirectory(core)
+add_subdirectory(kernels)
diff --git a/paddle/pten/tests/api/CMakeLists.txt b/paddle/pten/tests/api/CMakeLists.txt
new file mode 100644
index 0000000000000..2c6bd9c45d18a
--- /dev/null
+++ b/paddle/pten/tests/api/CMakeLists.txt
@@ -0,0 +1,7 @@
+cc_test(test_mean_api SRCS test_mean_api.cc DEPS pten_api pten_api_utils)
+cc_test(test_dot_api SRCS test_dot_api.cc DEPS pten_api pten_api_utils)
+cc_test(test_matmul_api SRCS test_matmul_api.cc DEPS pten_api pten_api_utils)
+cc_test(test_fill_api SRCS test_fill_api.cc DEPS pten_api pten_api_utils)
+cc_test(test_flatten_api SRCS test_flatten_api.cc DEPS pten_api pten_api_utils)
+cc_test(test_framework_storage SRCS test_storage.cc DEPS pten_api_utils)
+cc_test(test_framework_tensor_utils SRCS test_tensor_utils.cc DEPS pten_api_utils)
diff --git a/paddle/pten/tests/api/test_dot_api.cc b/paddle/pten/tests/api/test_dot_api.cc
new file mode 100644
index 0000000000000..2c7b65f98135b
--- /dev/null
+++ b/paddle/pten/tests/api/test_dot_api.cc
@@ -0,0 +1,84 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include <memory>
+
+#include "paddle/pten/api/include/linalg.h"
+
+#include "paddle/pten/api/lib/utils/allocator.h"
+#include "paddle/pten/core/dense_tensor.h"
+#include "paddle/pten/core/kernel_registry.h"
+
+PT_DECLARE_MODULE(LinalgCPU);
+
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+PT_DECLARE_MODULE(LinalgCUDA);
+#endif
+
+namespace framework = paddle::framework;
+using DDim = paddle::framework::DDim;
+
+// TODO(chenweihang): Remove this test after the API is used in the dygraph
+TEST(API, dot) {
+  // 1. create tensor
+  const auto alloc = std::make_shared<paddle::experimental::DefaultAllocator>(
+      paddle::platform::CPUPlace());
+  auto dense_x = std::make_shared<pten::DenseTensor>(
+      alloc,
+      pten::DenseTensorMeta(pten::DataType::FLOAT32,
+                            framework::make_ddim({3, 10}),
+                            pten::DataLayout::NCHW));
+  auto* dense_x_data = dense_x->mutable_data<float>();
+
+  auto dense_y = std::make_shared<pten::DenseTensor>(
+      alloc,
+      pten::DenseTensorMeta(pten::DataType::FLOAT32,
+                            framework::make_ddim({3, 10}),
+                            pten::DataLayout::NCHW));
+  auto* dense_y_data = dense_y->mutable_data<float>();
+
+  float sum[3] = {0.0, 0.0, 0.0};
+  for (size_t i = 0; i < 3; ++i) {
+    for (size_t j = 0; j < 10; ++j) {
+      dense_x_data[i * 10 + j] = (i * 10 + j) * 1.0;
+      dense_y_data[i * 10 + j] = (i * 10 + j) * 1.0;
+      sum[i] += (i * 10 + j) * (i * 10 + j) * 1.0;
+    }
+  }
+
+  paddle::experimental::Tensor x(dense_x);
+  paddle::experimental::Tensor y(dense_y);
+
+  // 2. test API
+  auto out = paddle::experimental::dot(x, y);
+
+  // 3. check result
+  ASSERT_EQ(out.shape().size(), 2);
+  ASSERT_EQ(out.shape()[0], 3);
+  ASSERT_EQ(out.numel(), 3);
+  ASSERT_EQ(out.is_cpu(), true);
+  ASSERT_EQ(out.type(), pten::DataType::FLOAT32);
+  ASSERT_EQ(out.layout(), pten::DataLayout::NCHW);
+  ASSERT_EQ(out.initialized(), true);
+
+  auto expect_result = sum;
+  auto dense_out = std::dynamic_pointer_cast<pten::DenseTensor>(out.impl());
+  auto actual_result0 = dense_out->data<float>()[0];
+  auto actual_result1 = dense_out->data<float>()[1];
+  auto actual_result2 = dense_out->data<float>()[2];
+  ASSERT_NEAR(expect_result[0], actual_result0, 1e-6f);
+  ASSERT_NEAR(expect_result[1], actual_result1, 1e-6f);
+  ASSERT_NEAR(expect_result[2], actual_result2, 1e-6f);
+}
diff --git a/paddle/pten/tests/api/test_fill_api.cc b/paddle/pten/tests/api/test_fill_api.cc
new file mode 100644
index 0000000000000..897637942547e
--- /dev/null
+++ b/paddle/pten/tests/api/test_fill_api.cc
@@ -0,0 +1,159 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include <memory>
+
+#include "paddle/pten/api/include/creation.h"
+
+#include "paddle/pten/api/lib/utils/allocator.h"
+#include "paddle/pten/core/dense_tensor.h"
+#include "paddle/pten/core/kernel_registry.h"
+
+PT_DECLARE_MODULE(CreationCPU);
+
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+PT_DECLARE_MODULE(CreationCUDA);
+#endif
+
+namespace framework = paddle::framework;
+using DDim = paddle::framework::DDim;
+
+// TODO(chenweihang): Remove this test after the API is used in the dygraph
+TEST(API, full_like) {
+  // 1. create tensor
+  const auto alloc = std::make_shared<paddle::experimental::DefaultAllocator>(
+      paddle::platform::CPUPlace());
+  auto dense_x = std::make_shared<pten::DenseTensor>(
+      alloc,
+      pten::DenseTensorMeta(pten::DataType::FLOAT32,
+                            framework::make_ddim({3, 2}),
+                            pten::DataLayout::NCHW));
+  auto* dense_x_data = dense_x->mutable_data<float>();
+  dense_x_data[0] = 0;
+
+  float val = 1.0;
+
+  paddle::experimental::Tensor x(dense_x);
+
+  // 2. test API
+  auto out = paddle::experimental::full_like(x, val, pten::DataType::FLOAT32);
+
+  // 3. check result
+  ASSERT_EQ(out.shape().size(), 2);
+  ASSERT_EQ(out.shape()[0], 3);
+  ASSERT_EQ(out.numel(), 6);
+  ASSERT_EQ(out.is_cpu(), true);
+  ASSERT_EQ(out.type(), pten::DataType::FLOAT32);
+  ASSERT_EQ(out.layout(), pten::DataLayout::NCHW);
+  ASSERT_EQ(out.initialized(), true);
+
+  auto dense_out = std::dynamic_pointer_cast<pten::DenseTensor>(out.impl());
+  auto* actual_result = dense_out->data<float>();
+  for (auto i = 0; i < 6; i++) {
+    ASSERT_NEAR(actual_result[i], val, 1e-6f);
+  }
+}
+
+TEST(API, zeros_like) {
+  // 1. create tensor
+  const auto alloc = std::make_shared<paddle::experimental::DefaultAllocator>(
+      paddle::platform::CPUPlace());
+  auto dense_x = std::make_shared<pten::DenseTensor>(
+      alloc,
+      pten::DenseTensorMeta(pten::DataType::FLOAT32,
+                            framework::make_ddim({3, 2}),
+                            pten::DataLayout::NCHW));
+  auto* dense_x_data = dense_x->mutable_data<float>();
+  dense_x_data[0] = 1;
+
+  paddle::experimental::Tensor x(dense_x);
+
+  // 2. test API
+  auto out = paddle::experimental::zeros_like(x, pten::DataType::INT32);
+
+  // 3. check result
+  ASSERT_EQ(out.shape().size(), 2);
+  ASSERT_EQ(out.shape()[0], 3);
+  ASSERT_EQ(out.numel(), 6);
+  ASSERT_EQ(out.is_cpu(), true);
+  ASSERT_EQ(out.type(), pten::DataType::INT32);
+  ASSERT_EQ(out.layout(), pten::DataLayout::NCHW);
+  ASSERT_EQ(out.initialized(), true);
+
+  auto dense_out = std::dynamic_pointer_cast<pten::DenseTensor>(out.impl());
+  auto* actual_result = dense_out->data<int32_t>();
+  for (auto i = 0; i < 6; i++) {
+    ASSERT_EQ(actual_result[i], 0);
+  }
+}
+
+TEST(API, ones_like) {
+  // 1. create tensor
+  const auto alloc = std::make_shared<paddle::experimental::DefaultAllocator>(
+      paddle::platform::CPUPlace());
+  auto dense_x = std::make_shared<pten::DenseTensor>(
+      alloc,
+      pten::DenseTensorMeta(pten::DataType::INT32,
+                            framework::make_ddim({3, 2}),
+                            pten::DataLayout::NCHW));
+  auto* dense_x_data = dense_x->mutable_data<int32_t>();
+  dense_x_data[0] = 0;
+
+  paddle::experimental::Tensor x(dense_x);
+
+  // 2. test API
+  auto out = paddle::experimental::ones_like(x, pten::DataType::INT32);
+
+  // 3. check result
+  ASSERT_EQ(out.shape().size(), 2);
+  ASSERT_EQ(out.shape()[0], 3);
+  ASSERT_EQ(out.numel(), 6);
+  ASSERT_EQ(out.is_cpu(), true);
+  ASSERT_EQ(out.type(), pten::DataType::INT32);
+  ASSERT_EQ(out.layout(), pten::DataLayout::NCHW);
+  ASSERT_EQ(out.initialized(), true);
+
+  auto dense_out = std::dynamic_pointer_cast<pten::DenseTensor>(out.impl());
+  auto* actual_result = dense_out->data<int32_t>();
+  for (auto i = 0; i < 6; i++) {
+    ASSERT_EQ(actual_result[i], 1);
+  }
+}
+
+TEST(API, full) {
+  // 1. create tensor
+  const auto alloc = std::make_shared<paddle::experimental::DefaultAllocator>(
+      paddle::platform::CPUPlace());
+
+  float val = 1.0;
+
+  // 2. test API
+  auto out = paddle::experimental::full({3, 2}, val, pten::DataType::FLOAT32);
+
+  // 3. check result
+  ASSERT_EQ(out.shape().size(), 2);
+  ASSERT_EQ(out.shape()[0], 3);
+  ASSERT_EQ(out.numel(), 6);
+  ASSERT_EQ(out.is_cpu(), true);
+  ASSERT_EQ(out.type(), pten::DataType::FLOAT32);
+  ASSERT_EQ(out.layout(), pten::DataLayout::NCHW);
+  ASSERT_EQ(out.initialized(), true);
+
+  auto dense_out = std::dynamic_pointer_cast<pten::DenseTensor>(out.impl());
+  auto* actual_result = dense_out->data<float>();
+  for (auto i = 0; i < 6; i++) {
+    ASSERT_NEAR(actual_result[i], val, 1e-6f);
+  }
+}
diff --git a/paddle/pten/tests/api/test_flatten_api.cc b/paddle/pten/tests/api/test_flatten_api.cc
new file mode 100644
index 0000000000000..3701c358c667e
--- /dev/null
+++ b/paddle/pten/tests/api/test_flatten_api.cc
@@ -0,0 +1,72 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include <memory>
+
+#include "paddle/pten/api/include/manipulation.h"
+
+#include "paddle/pten/api/lib/utils/allocator.h"
+#include "paddle/pten/core/dense_tensor.h"
+#include "paddle/pten/core/kernel_registry.h"
+
+PT_DECLARE_MODULE(ManipulationCPU);
+
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+PT_DECLARE_MODULE(ManipulationCUDA);
+#endif
+
+namespace framework = paddle::framework;
+using DDim = paddle::framework::DDim;
+
+// TODO(chenweihang): Remove this test after the API is used in the dygraph
+TEST(API, flatten) {
+  // 1. create tensor
+  const auto alloc = std::make_shared<paddle::experimental::DefaultAllocator>(
+      paddle::platform::CPUPlace());
+  auto dense_x = std::make_shared<pten::DenseTensor>(
+      alloc,
+      pten::DenseTensorMeta(pten::DataType::FLOAT32,
+                            framework::make_ddim({3, 2, 2, 3}),
+                            pten::DataLayout::NCHW));
+  auto* dense_x_data = dense_x->mutable_data<float>();
+
+  for (int i = 0; i < dense_x->numel(); i++) {
+    dense_x_data[i] = i;
+  }
+
+  paddle::experimental::Tensor x(dense_x);
+  int start_axis = 1, stop_axis = 2;
+  // 2. test API
+  auto out = paddle::experimental::flatten(x, start_axis, stop_axis);
+
+  // 3. check result
+  std::vector<int> expect_shape = {3, 4, 3};
+  ASSERT_EQ(out.shape()[0], expect_shape[0]);
+  ASSERT_EQ(out.shape()[1], expect_shape[1]);
+  ASSERT_EQ(out.shape()[2], expect_shape[2]);
+  ASSERT_EQ(out.numel(), 36);
+  ASSERT_EQ(out.is_cpu(), true);
+  ASSERT_EQ(out.type(), pten::DataType::FLOAT32);
+  ASSERT_EQ(out.layout(), pten::DataLayout::NCHW);
+  ASSERT_EQ(out.initialized(), true);
+  bool value_equal = true;
+  auto dense_out = std::dynamic_pointer_cast<pten::DenseTensor>(out.impl());
+  auto* dense_out_data = dense_out->data<float>();
+  for (int i = 0; i < dense_x->numel(); i++) {
+    if (std::abs(dense_x_data[i] - dense_out_data[i]) > 1e-6f)
+      value_equal = false;
+  }
+  ASSERT_EQ(value_equal, true);
+}
diff --git a/paddle/pten/tests/api/test_matmul_api.cc b/paddle/pten/tests/api/test_matmul_api.cc
new file mode 100644
index 0000000000000..83a70c905b60e
--- /dev/null
+++ b/paddle/pten/tests/api/test_matmul_api.cc
@@ -0,0 +1,160 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include <memory>
+
+#include "paddle/pten/api/include/linalg.h"
+
+#include "paddle/pten/api/lib/utils/allocator.h"
+#include "paddle/pten/core/dense_tensor.h"
+#include "paddle/pten/core/kernel_registry.h"
+#include "paddle/pten/kernels/cuda/utils.h"
+
+PT_DECLARE_MODULE(LinalgCPU);
+
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+PT_DECLARE_MODULE(LinalgCUDA);
+#endif
+
+namespace framework = paddle::framework;
+using DDim = paddle::framework::DDim;
+
+TEST(API, matmul_cpu) {
+  // 1. create tensor
+  const auto alloc = std::make_shared<paddle::experimental::DefaultAllocator>(
+      paddle::platform::CPUPlace());
+  auto dense_x = std::make_shared<pten::DenseTensor>(
+      alloc,
+      pten::DenseTensorMeta(pten::DataType::FLOAT32,
+                            framework::make_ddim({3, 3}),
+                            pten::DataLayout::NCHW));
+
+  auto* dense_x_data = dense_x->mutable_data<float>();
+
+  auto dense_y = std::make_shared<pten::DenseTensor>(
+      alloc,
+      pten::DenseTensorMeta(pten::DataType::FLOAT32,
+                            framework::make_ddim({3, 3}),
+                            pten::DataLayout::NCHW));
+  auto* dense_y_data = dense_y->mutable_data<float>();
+
+  for (size_t i = 0; i < 9; ++i) {
+    dense_x_data[i] = 1.0;
+    dense_y_data[i] = 2.0;
+  }
+  std::vector<float> sum(9, 6.0);
+
+  paddle::experimental::Tensor x(dense_x);
+  paddle::experimental::Tensor y(dense_y);
+
+  // 2. test API
+  auto out = paddle::experimental::matmul(x, y, false, false);
+
+  // 3. check result
+  ASSERT_EQ(out.shape().size(), 2);
+  ASSERT_EQ(out.shape()[0], 3);
+  ASSERT_EQ(out.shape()[1], 3);
+  ASSERT_EQ(out.numel(), 9);
+  ASSERT_EQ(out.type(), pten::DataType::FLOAT32);
+  ASSERT_EQ(out.layout(), pten::DataLayout::NCHW);
+  ASSERT_EQ(out.initialized(), true);
+
+  auto dense_out = std::dynamic_pointer_cast<pten::DenseTensor>(out.impl());
+
+  for (size_t i = 0; i < 9; i++) {
+    ASSERT_NEAR(sum[i], dense_out->data<float>()[i], 1e-6f);
+  }
+}
+
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+TEST(API, matmul_cuda) {
+  // Prepare CPU Dense Tensor
+  const auto alloc_cpu =
+      std::make_shared<paddle::experimental::DefaultAllocator>(
+          paddle::platform::CPUPlace());
+  auto ref_x = std::make_shared<pten::DenseTensor>(
+      alloc_cpu,
+      pten::DenseTensorMeta(pten::DataType::FLOAT32,
+                            framework::make_ddim({3, 3}),
+                            pten::DataLayout::NCHW));
+
+  auto* ref_x_data = ref_x->mutable_data<float>();
+
+  auto ref_y = std::make_shared<pten::DenseTensor>(
+      alloc_cpu,
+      pten::DenseTensorMeta(pten::DataType::FLOAT32,
+                            framework::make_ddim({3, 3}),
+                            pten::DataLayout::NCHW));
+  auto* ref_y_data = ref_y->mutable_data<float>();
+
+  for (size_t i = 0; i < 9; ++i) {
+    ref_x_data[i] = 1.0;
+    ref_y_data[i] = 2.0;
+  }
+  std::vector<float> sum(9, 6.0);
+
+  // 1. create tensor
+  const auto alloc_cuda =
+      std::make_shared<paddle::experimental::DefaultAllocator>(
+          paddle::platform::CUDAPlace());
+  auto dense_x = std::make_shared<pten::DenseTensor>(
+      alloc_cuda,
+      pten::DenseTensorMeta(pten::DataType::FLOAT32,
+                            framework::make_ddim({3, 3}),
+                            pten::DataLayout::NCHW));
+
+  auto dense_y = std::make_shared<pten::DenseTensor>(
+      alloc_cuda,
+      pten::DenseTensorMeta(pten::DataType::FLOAT32,
+                            framework::make_ddim({3, 3}),
+                            pten::DataLayout::NCHW));
+
+  auto& pool = paddle::platform::DeviceContextPool::Instance();
+  auto place = paddle::platform::CUDAPlace();
+  auto* dev_ctx = pool.GetByPlace(place);
+
+  pten::Copy(*dev_ctx, *ref_x.get(), dense_x.get());
+  pten::Copy(*dev_ctx, *ref_y.get(), dense_y.get());
+
+  paddle::experimental::Tensor x(dense_x);
+  paddle::experimental::Tensor y(dense_y);
+
+  // 2. test API
+  auto out = paddle::experimental::matmul(x, y, false, false);
+
+  // 3. check result
+  ASSERT_EQ(out.shape().size(), 2);
+  ASSERT_EQ(out.shape()[0], 3);
+  ASSERT_EQ(out.shape()[1], 3);
+  ASSERT_EQ(out.numel(), 9);
+  ASSERT_EQ(out.type(), pten::DataType::FLOAT32);
+  ASSERT_EQ(out.layout(), pten::DataLayout::NCHW);
+  ASSERT_EQ(out.initialized(), true);
+
+  auto dense_out = std::dynamic_pointer_cast<pten::DenseTensor>(out.impl());
+
+  auto ref_out = std::make_shared<pten::DenseTensor>(
+      alloc_cpu,
+      pten::DenseTensorMeta(
+          pten::DataType::FLOAT32, out.shape(), pten::DataLayout::NCHW));
+
+  pten::Copy(*dev_ctx, *dense_out.get(), ref_out.get());
+
+  for (size_t i = 0; i < 9; i++) {
+    ASSERT_NEAR(sum[i], ref_out->data<float>()[i], 1e-6f);
+  }
+}
+
+#endif
diff --git a/paddle/pten/tests/api/test_mean_api.cc b/paddle/pten/tests/api/test_mean_api.cc
new file mode 100644
index 0000000000000..d772e58659507
--- /dev/null
+++ b/paddle/pten/tests/api/test_mean_api.cc
@@ -0,0 +1,69 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include <memory>
+
+#include "paddle/pten/api/include/math.h"
+
+#include "paddle/pten/api/lib/utils/allocator.h"
+#include "paddle/pten/core/dense_tensor.h"
+#include "paddle/pten/core/kernel_registry.h"
+
+PT_DECLARE_MODULE(MathCPU);
+
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+PT_DECLARE_MODULE(MathCUDA);
+#endif
+
+namespace framework = paddle::framework;
+using DDim = paddle::framework::DDim;
+
+// TODO(chenweihang): Remove this test after the API is used in the dygraph
+TEST(API, mean) {
+  // 1. create tensor
+  const auto alloc = std::make_shared<paddle::experimental::DefaultAllocator>(
+      paddle::platform::CPUPlace());
+  auto dense_x = std::make_shared<pten::DenseTensor>(
+      alloc,
+      pten::DenseTensorMeta(pten::DataType::FLOAT32,
+                            framework::make_ddim({3, 4}),
+                            pten::DataLayout::NCHW));
+  auto* dense_x_data = dense_x->mutable_data<float>();
+
+  float sum = 0.0;
+  for (size_t i = 0; i < 12; ++i) {
+    dense_x_data[i] = i * 1.0;
+    sum += i * 1.0;
+  }
+
+  paddle::experimental::Tensor x(dense_x);
+
+  // 2. test API
+  auto out = paddle::experimental::mean(x);
+
+  // 3. check result
+  ASSERT_EQ(out.shape().size(), 1);
+  ASSERT_EQ(out.shape()[0], 1);
+  ASSERT_EQ(out.numel(), 1);
+  ASSERT_EQ(out.is_cpu(), true);
+  ASSERT_EQ(out.type(), pten::DataType::FLOAT32);
+  ASSERT_EQ(out.layout(), pten::DataLayout::NCHW);
+  ASSERT_EQ(out.initialized(), true);
+
+  auto expect_result = sum / 12;
+  auto dense_out = std::dynamic_pointer_cast<pten::DenseTensor>(out.impl());
+  auto actual_result = dense_out->data<float>()[0];
+  ASSERT_NEAR(expect_result, actual_result, 1e-6f);
+}
diff --git a/paddle/pten/tests/api/test_storage.cc b/paddle/pten/tests/api/test_storage.cc
new file mode 100644
index 0000000000000..fc95f95f26e0b
--- /dev/null
+++ b/paddle/pten/tests/api/test_storage.cc
@@ -0,0 +1,65 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <vector>
+
+#include "gtest/gtest.h"
+
+#include "paddle/pten/api/lib/utils/allocator.h"
+#include "paddle/pten/api/lib/utils/storage.h"
+
+namespace paddle {
+namespace experimental {
+namespace tests {
+
+TEST(host_storage, external_stroage) {
+  const size_t size{100};
+  const auto a =
+      std::make_shared<DefaultAllocator>(paddle::platform::CPUPlace());
+  pten::intrusive_ptr<pten::Storage> in_storage =
+      pten::make_intrusive<pten::TensorStorage>(a, size);
+  char* data = static_cast<char*>(in_storage->data());
+  for (size_t i = 0; i < size; ++i) {
+    data[i] = i;
+  }
+  const size_t delta{1};
+  const size_t n{10};
+  auto ex_storage = pten::make_intrusive<ExternalStorage>(in_storage, delta, n);
+  CHECK_EQ(ex_storage->size(), n);
+  CHECK(paddle::platform::is_cpu_place(ex_storage->place()));
+  CHECK(!ex_storage->OwnsMemory());
+  for (size_t i = delta; i < delta + n; ++i) {
+    CHECK_EQ(data[i], static_cast<char>(i));
+  }
+}
+
+TEST(host_storage, external_vector) {
+  std::vector<char> data(100);
+  for (size_t i = 0; i < data.size(); ++i) {
+    data[i] = i;
+  }
+  const size_t delta{1};
+  const size_t n{10};
+  auto ex_storage = pten::make_intrusive<ExternalStorage>(
+      data.data(), n, paddle::platform::CPUPlace());
+  CHECK_EQ(ex_storage->size(), n);
+  CHECK(paddle::platform::is_cpu_place(ex_storage->place()));
+  CHECK(!ex_storage->OwnsMemory());
+  for (size_t i = delta; i < delta + n; ++i) {
+    CHECK_EQ(data[i], static_cast<char>(i));
+  }
+}
+}  // namespace tests
+}  // namespace experimental
+}  // namespace paddle
diff --git a/paddle/pten/tests/api/test_tensor_utils.cc b/paddle/pten/tests/api/test_tensor_utils.cc
new file mode 100644
index 0000000000000..fd52b96542c71
--- /dev/null
+++ b/paddle/pten/tests/api/test_tensor_utils.cc
@@ -0,0 +1,125 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "gtest/gtest.h"
+
+#include "paddle/pten/api/lib/utils/tensor_utils.h"
+
+namespace paddle {
+namespace experimental {
+namespace tests {
+
+using DDim = paddle::framework::DDim;
+using DataType = paddle::experimental::DataType;
+using DataLayout = paddle::experimental::DataLayout;
+
+using DenseTensor = pten::DenseTensor;
+using DenseTensorMeta = pten::DenseTensorMeta;
+
+TEST(tensor_utils, dense_tensor_to_lod_tensor) {
+  const DDim dims({2, 1});
+  const DataType dtype{DataType::FLOAT32};
+  const DataLayout layout{DataLayout::NCHW};
+  const std::vector<std::vector<size_t>> lod{{0, 2}};
+  DenseTensorMeta meta(dtype, dims, layout, lod);
+
+  auto alloc = std::make_shared<DefaultAllocator>(platform::CPUPlace());
+
+  DenseTensor dense_tensor(alloc, meta);
+  float* data = dense_tensor.mutable_data<float>();
+  data[0] = 1.0f;
+  data[1] = 2.1f;
+
+  framework::LoDTensor lod_tensor;
+  MovesStorage(&dense_tensor, &lod_tensor);
+
+  CHECK(dense_tensor.lod().size() == lod_tensor.lod().size());
+  CHECK(dense_tensor.lod()[0] ==
+        static_cast<std::vector<size_t>>((lod_tensor.lod()[0])));
+  CHECK(dense_tensor.data_type() ==
+        pten::TransToPtenDataType(lod_tensor.type()));
+  CHECK(dense_tensor.layout() ==
+        pten::TransToPtenDataLayout(lod_tensor.layout()));
+  CHECK(platform::is_cpu_place(lod_tensor.place()));
+
+  CHECK(lod_tensor.data<float>()[0] == 1.0f);
+  CHECK(lod_tensor.data<float>()[1] == 2.1f);
+
+  auto dense_tensor_1 = MakePtenDenseTensor(lod_tensor);
+  CHECK(dense_tensor_1->dims() == dims);
+  CHECK(dense_tensor_1->data_type() == dtype);
+  CHECK(dense_tensor_1->layout() == layout);
+  CHECK(dense_tensor_1->lod().size() == lod.size());
+  CHECK(dense_tensor_1->lod()[0] == lod[0]);
+  const float* data_1 = dense_tensor_1->data<float>();
+  CHECK(data_1[0] == 1.0f);
+  CHECK(data_1[1] == 2.1f);
+}
+
+TEST(tensor_utils, dense_tensor_to_tensor) {
+  const DDim dims({2, 1});
+  const DataType dtype{DataType::FLOAT32};
+  const DataLayout layout{DataLayout::NCHW};
+  DenseTensorMeta meta(dtype, dims, layout);
+
+  auto alloc = std::make_shared<DefaultAllocator>(platform::CPUPlace());
+
+  DenseTensor dense_tensor(alloc, meta);
+  float* data = dense_tensor.mutable_data<float>();
+  data[0] = 1.0f;
+  data[1] = 2.1f;
+
+  framework::Tensor tensor;
+  MovesStorage(&dense_tensor, &tensor);
+
+  CHECK(dense_tensor.data_type() == pten::TransToPtenDataType(tensor.type()));
+  CHECK(dense_tensor.layout() == pten::TransToPtenDataLayout(tensor.layout()));
+  CHECK(platform::is_cpu_place(tensor.place()));
+
+  CHECK(tensor.data<float>()[0] == 1.0f);
+  CHECK(tensor.data<float>()[1] == 2.1f);
+
+  auto dense_tensor_1 = MakePtenDenseTensor(tensor);
+  CHECK(dense_tensor_1->dims() == dims);
+  CHECK(dense_tensor_1->data_type() == dtype);
+  CHECK(dense_tensor_1->layout() == layout);
+  const float* data_1 = dense_tensor_1->data<float>();
+  CHECK(data_1[0] == 1.0f);
+  CHECK(data_1[1] == 2.1f);
+}
+
+TEST(PtenUtils, VarToPtTensor) {
+  // 1. create Variable
+  paddle::framework::Variable v;
+  auto selected_rows = v.GetMutable<paddle::framework::SelectedRows>();
+  paddle::framework::Tensor* value = selected_rows->mutable_value();
+  auto* data = value->mutable_data<int>(paddle::framework::make_ddim({1, 1}),
+                                        paddle::platform::CPUPlace());
+  data[0] = 123;
+  pten::Backend expect_backend = pten::Backend::CPU;
+
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+  expect_backend = pten::Backend::CUDA;
+#endif
+  auto tensor_def = pten::TensorArgDef(
+      expect_backend, pten::DataLayout::NCHW, pten::DataType::INT32);
+  // 2. test API
+  auto tensor_x = MakePtenTensorBaseFromVar(v, tensor_def);
+  // 3. check result
+  ASSERT_EQ(tensor_x->data_type(), pten::DataType::INT32);
+}
+
+}  // namespace tests
+}  // namespace experimental
+}  // namespace paddle
diff --git a/paddle/pten/tests/common/CMakeLists.txt b/paddle/pten/tests/common/CMakeLists.txt
new file mode 100644
index 0000000000000..c0a5414d53e47
--- /dev/null
+++ b/paddle/pten/tests/common/CMakeLists.txt
@@ -0,0 +1,3 @@
+cc_test(pten_test_backend SRCS test_backend.cc DEPS gtest)
+cc_test(pten_test_data_layout SRCS test_data_layout.cc DEPS gtest)
+cc_test(pten_test_data_type SRCS test_data_type.cc DEPS gtest)
diff --git a/paddle/pten/tests/common/test_backend.cc b/paddle/pten/tests/common/test_backend.cc
new file mode 100644
index 0000000000000..1c17c881ed24f
--- /dev/null
+++ b/paddle/pten/tests/common/test_backend.cc
@@ -0,0 +1,49 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include <iostream>
+
+#include "paddle/pten/common/backend.h"
+
+TEST(Backend, OStream) {
+  std::ostringstream oss;
+  oss << pten::Backend::UNDEFINED;
+  EXPECT_EQ(oss.str(), "Undefined");
+  oss.str("");
+  oss << pten::Backend::CPU;
+  EXPECT_EQ(oss.str(), "CPU");
+  oss.str("");
+  oss << pten::Backend::CUDA;
+  EXPECT_EQ(oss.str(), "CUDA");
+  oss.str("");
+  oss << pten::Backend::XPU;
+  EXPECT_EQ(oss.str(), "XPU");
+  oss.str("");
+  oss << pten::Backend::NPU;
+  EXPECT_EQ(oss.str(), "NPU");
+  oss.str("");
+  oss << pten::Backend::MKLDNN;
+  EXPECT_EQ(oss.str(), "MKLDNN");
+  oss.str("");
+  oss << pten::Backend::CUDNN;
+  EXPECT_EQ(oss.str(), "CUDNN");
+  oss.str("");
+  try {
+    oss << pten::Backend::NUM_BACKENDS;
+  } catch (paddle::platform::EnforceNotMet &exception) {
+    std::string ex_msg = exception.what();
+    EXPECT_TRUE(ex_msg.find("Invalid enum backend type") != std::string::npos);
+  }
+}
diff --git a/paddle/pten/tests/common/test_data_layout.cc b/paddle/pten/tests/common/test_data_layout.cc
new file mode 100644
index 0000000000000..7fe1b6c2ffd2b
--- /dev/null
+++ b/paddle/pten/tests/common/test_data_layout.cc
@@ -0,0 +1,45 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include <iostream>
+#include <sstream>
+
+#include "paddle/pten/common/layout.h"
+
+TEST(DataLayout, OStream) {
+  std::ostringstream oss;
+  oss << pten::DataLayout::UNDEFINED;
+  EXPECT_EQ(oss.str(), "Undefined");
+  oss.str("");
+  oss << pten::DataLayout::ANY;
+  EXPECT_EQ(oss.str(), "Any");
+  oss.str("");
+  oss << pten::DataLayout::NHWC;
+  EXPECT_EQ(oss.str(), "NHWC");
+  oss.str("");
+  oss << pten::DataLayout::NCHW;
+  EXPECT_EQ(oss.str(), "NCHW");
+  oss.str("");
+  oss << pten::DataLayout::MKLDNN;
+  EXPECT_EQ(oss.str(), "MKLDNN");
+  oss.str("");
+  try {
+    oss << pten::DataLayout::NUM_DATA_LAYOUTS;
+  } catch (paddle::platform::EnforceNotMet &exception) {
+    std::string ex_msg = exception.what();
+    EXPECT_TRUE(ex_msg.find("Invalid enum data layout type") !=
+                std::string::npos);
+  }
+}
diff --git a/paddle/pten/tests/common/test_data_type.cc b/paddle/pten/tests/common/test_data_type.cc
new file mode 100644
index 0000000000000..28d58858bb42c
--- /dev/null
+++ b/paddle/pten/tests/common/test_data_type.cc
@@ -0,0 +1,68 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include <iostream>
+#include <sstream>
+
+#include "paddle/pten/common/data_type.h"
+
+TEST(DataType, OStream) {
+  std::ostringstream oss;
+  oss << pten::DataType::UNDEFINED;
+  EXPECT_EQ(oss.str(), "Undefined");
+  oss.str("");
+  oss << pten::DataType::BOOL;
+  EXPECT_EQ(oss.str(), "bool");
+  oss.str("");
+  oss << pten::DataType::INT8;
+  EXPECT_EQ(oss.str(), "int8");
+  oss.str("");
+  oss << pten::DataType::UINT8;
+  EXPECT_EQ(oss.str(), "uint8");
+  oss.str("");
+  oss << pten::DataType::INT16;
+  EXPECT_EQ(oss.str(), "int16");
+  oss.str("");
+  oss << pten::DataType::INT32;
+  EXPECT_EQ(oss.str(), "int32");
+  oss.str("");
+  oss << pten::DataType::INT64;
+  EXPECT_EQ(oss.str(), "int64");
+  oss.str("");
+  oss << pten::DataType::BFLOAT16;
+  EXPECT_EQ(oss.str(), "bfloat16");
+  oss.str("");
+  oss << pten::DataType::FLOAT16;
+  EXPECT_EQ(oss.str(), "float16");
+  oss.str("");
+  oss << pten::DataType::FLOAT32;
+  EXPECT_EQ(oss.str(), "float32");
+  oss.str("");
+  oss << pten::DataType::FLOAT64;
+  EXPECT_EQ(oss.str(), "float64");
+  oss.str("");
+  oss << pten::DataType::COMPLEX64;
+  EXPECT_EQ(oss.str(), "complex64");
+  oss.str("");
+  oss << pten::DataType::COMPLEX128;
+  EXPECT_EQ(oss.str(), "complex128");
+  oss.str("");
+  try {
+    oss << pten::DataType::NUM_DATA_TYPES;
+  } catch (paddle::platform::EnforceNotMet &exception) {
+    std::string ex_msg = exception.what();
+    EXPECT_TRUE(ex_msg.find("Invalid enum data type") != std::string::npos);
+  }
+}
diff --git a/paddle/pten/tests/core/CMakeLists.txt b/paddle/pten/tests/core/CMakeLists.txt
new file mode 100644
index 0000000000000..b25439cfe2527
--- /dev/null
+++ b/paddle/pten/tests/core/CMakeLists.txt
@@ -0,0 +1,6 @@
+cc_test(test_allocator SRCS test_allocator.cc DEPS tensor_base)
+cc_test(test_storage SRCS test_storage.cc DEPS tensor_base)
+cc_test(test_dense_tensor SRCS test_dense_tensor.cc DEPS dense_tensor)
+cc_test(test_intrusive_ptr SRCS test_intrusive_ptr.cc)
+cc_test(test_type_info SRCS test_type_info.cc)
+cc_test(test_kernel_factory SRCS test_kernel_factory.cc DEPS kernel_factory)
diff --git a/paddle/pten/tests/core/allocator.h b/paddle/pten/tests/core/allocator.h
new file mode 100644
index 0000000000000..053e8ba7b382b
--- /dev/null
+++ b/paddle/pten/tests/core/allocator.h
@@ -0,0 +1,89 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <memory>
+
+#include "paddle/pten/core/allocator.h"
+
+namespace pten {
+namespace tests {
+
+class HostAllocatorSample : public pten::RawAllocator {
+ public:
+  using Place = paddle::platform::Place;
+  void* Allocate(size_t bytes_size) override {
+    return ::operator new(bytes_size);
+  }
+  void Deallocate(void* ptr, size_t bytes_size) override {
+    return ::operator delete(ptr);
+  }
+  const Place& place() const override { return place_; }
+
+ private:
+  Place place_{paddle::platform::CPUPlace()};
+};
+
+class FancyAllocator : public pten::Allocator {
+ public:
+  static void Delete(void* data) { ::operator delete(data); }
+
+  Allocation Allocate(size_t bytes_size) override {
+    void* data = ::operator new(bytes_size);
+    return Allocation(data, data, &Delete, paddle::platform::CPUPlace());
+  }
+};
+
+template <typename T>
+struct CustomAllocator {
+  using value_type = T;
+  using Allocator = pten::RawAllocator;
+
+  explicit CustomAllocator(const std::shared_ptr<Allocator>& a) noexcept
+      : alloc_(a) {}
+
+  CustomAllocator(const CustomAllocator&) noexcept = default;
+  T* allocate(std::size_t n) {
+    return static_cast<T*>(alloc_->Allocate(n * sizeof(T)));
+  }
+  void deallocate(T* p, std::size_t n) {
+    return alloc_->Deallocate(p, sizeof(T) * n);
+  }
+
+  template <typename R, typename U>
+  friend bool operator==(const CustomAllocator<R>&,
+                         const CustomAllocator<U>&) noexcept;
+  template <typename R, typename U>
+  friend bool operator!=(const CustomAllocator<R>&,
+                         const CustomAllocator<U>&) noexcept;
+
+ private:
+  std::shared_ptr<Allocator> alloc_;
+};
+
+template <typename T, typename U>
+inline bool operator==(const CustomAllocator<T>& lhs,
+                       const CustomAllocator<U>& rhs) noexcept {
+  return &lhs.alloc_ == &rhs.alloc_;
+}
+
+template <typename T, typename U>
+inline bool operator!=(const CustomAllocator<T>& lhs,
+                       const CustomAllocator<U>& rhs) noexcept {
+  return &lhs.alloc_ != &rhs.alloc_;
+}
+
+}  // namespace tests
+}  // namespace pten
diff --git a/paddle/pten/tests/core/random.h b/paddle/pten/tests/core/random.h
new file mode 100644
index 0000000000000..4e2e55162d3a4
--- /dev/null
+++ b/paddle/pten/tests/core/random.h
@@ -0,0 +1,47 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <random>
+#include <type_traits>
+
+namespace pten {
+namespace tests {
+
+template <typename T,
+          typename =
+              typename std::enable_if<std::is_arithmetic<T>::value>::type>
+class RandomGenerator {
+  using distribution_type =
+      typename std::conditional<std::is_integral<T>::value,
+                                std::uniform_int_distribution<T>,
+                                std::uniform_real_distribution<T>>::type;
+
+  std::default_random_engine engine;
+  distribution_type distribution;
+
+ public:
+  auto operator()() -> decltype(distribution(engine)) {
+    return distribution(engine);
+  }
+};
+
+template <typename Container, typename T = typename Container::value_type>
+auto make_generator(Container const&) -> decltype(RandomGenerator<T>()) {
+  return RandomGenerator<T>();
+}
+
+}  // namespace tests
+}  // namespace pten
diff --git a/paddle/pten/tests/core/test_allocator.cc b/paddle/pten/tests/core/test_allocator.cc
new file mode 100644
index 0000000000000..c509d8bd20a01
--- /dev/null
+++ b/paddle/pten/tests/core/test_allocator.cc
@@ -0,0 +1,91 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <algorithm>
+#include <vector>
+
+#include "gtest/gtest.h"
+#include "paddle/fluid/framework/generator.h"
+#include "paddle/pten/tests/core/allocator.h"
+#include "paddle/pten/tests/core/random.h"
+#include "paddle/pten/tests/core/timer.h"
+
+namespace pten {
+namespace tests {
+
+template <typename T>
+bool host_allocator_test(size_t vector_size) {
+  std::vector<T> src(vector_size);
+  std::generate(src.begin(), src.end(), make_generator(src));
+  std::vector<T, CustomAllocator<T>> dst(
+      src.begin(),
+      src.end(),
+      CustomAllocator<T>(std::make_shared<HostAllocatorSample>()));
+  return std::equal(src.begin(), src.end(), dst.begin());
+}
+
+TEST(raw_allocator, host) {
+  CHECK(host_allocator_test<float>(1000));
+  CHECK(host_allocator_test<int32_t>(1000));
+  CHECK(host_allocator_test<int64_t>(1000));
+}
+
+class StorageRawAlloc {
+ public:
+  StorageRawAlloc(const std::shared_ptr<RawAllocator>& a, size_t size)
+      : alloc_(a) {
+    data_ = alloc_->Allocate(size);
+  }
+  ~StorageRawAlloc() { alloc_->Deallocate(data_, size); }
+
+ private:
+  void* data_;
+  size_t size;
+  std::shared_ptr<RawAllocator> alloc_;
+};
+
+class StorageFancyAlloc {
+ public:
+  StorageFancyAlloc(const std::shared_ptr<Allocator>& a, size_t size)
+      : alloc_(a), allocation_(a->Allocate(size)) {}
+
+ private:
+  std::shared_ptr<Allocator> alloc_;
+  Allocation allocation_;
+};
+
+TEST(benchmark, allocator) {
+  std::shared_ptr<RawAllocator> raw_allocator(new HostAllocatorSample);
+  std::shared_ptr<Allocator> fancy_allocator(new FancyAllocator);
+  const size_t cycles = 100;
+  Timer timer;
+  double t1{}, t2{};
+  for (size_t i = 0; i < cycles; ++i) {
+    timer.tic();
+    for (size_t i = 0; i < cycles; ++i) {
+      StorageRawAlloc(raw_allocator, i * 100);
+    }
+    t1 += timer.toc();
+    timer.tic();
+    for (size_t i = 0; i < cycles; ++i) {
+      StorageFancyAlloc(fancy_allocator, i * 100);
+    }
+    t2 += timer.toc();
+  }
+  std::cout << "The cost of raw alloc is " << t1 << "ms.\n";
+  std::cout << "The cost of fancy alloc with place is " << t2 << "ms.\n";
+}
+
+}  // namespace tests
+}  // namespace pten
diff --git a/paddle/pten/tests/core/test_dense_tensor.cc b/paddle/pten/tests/core/test_dense_tensor.cc
new file mode 100644
index 0000000000000..12476373f8d98
--- /dev/null
+++ b/paddle/pten/tests/core/test_dense_tensor.cc
@@ -0,0 +1,127 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "gtest/gtest.h"
+
+#include "paddle/pten/core/dense_tensor.h"
+#include "paddle/pten/tests/core/allocator.h"
+
+namespace pten {
+namespace tests {
+
+TEST(dense_tensor, meta) {
+  const DDim dims({1, 2});
+  const DataType dtype{DataType::INT8};
+  const DataLayout layout{DataLayout::NHWC};
+  // TODO(Shixiaowei02): need to check the lod is valid.
+  const std::vector<std::vector<size_t>> lod{};
+
+  DenseTensorMeta meta_0;
+  CHECK(!meta_0.valid());
+
+  DenseTensorMeta meta_1(dtype, dims);
+  CHECK(meta_1.type == dtype);
+  CHECK(meta_1.dims == dims);
+  CHECK(meta_1.valid());
+
+  DenseTensorMeta meta_2(dtype, dims, layout);
+  CHECK(meta_2.type == dtype);
+  CHECK(meta_2.dims == dims);
+  CHECK(meta_2.layout == layout);
+  CHECK(meta_2.valid());
+
+  DenseTensorMeta meta_3(dtype, dims, layout, lod);
+  CHECK(meta_3.type == dtype);
+  CHECK(meta_3.dims == dims);
+  CHECK(meta_3.layout == layout);
+  CHECK(meta_3.lod == lod);
+  CHECK(meta_3.valid());
+
+  DenseTensorMeta meta_4(meta_3);
+  CHECK(meta_4.type == dtype);
+  CHECK(meta_4.dims == dims);
+  CHECK(meta_4.layout == layout);
+  CHECK(meta_4.lod == lod);
+  CHECK(meta_4.valid());
+
+  DenseTensorMeta meta_5(std::move(meta_4));
+  CHECK(meta_5.type == dtype);
+  CHECK(meta_5.dims == dims);
+  CHECK(meta_5.layout == layout);
+  CHECK(meta_5.lod == lod);
+  CHECK(meta_5.valid());
+}
+
+TEST(dense_tensor, def_ctor) {
+  DenseTensor tensor_0;
+  CHECK(!tensor_0.valid());
+}
+
+TEST(dense_tensor, ctor) {
+  const DDim dims({1, 2});
+  const DataType dtype{DataType::INT8};
+  const DataLayout layout{DataLayout::NHWC};
+  const std::vector<std::vector<size_t>> lod{};
+  DenseTensorMeta meta(dtype, dims, layout, lod);
+
+  auto alloc = std::make_shared<FancyAllocator>();
+
+  auto check_dense_tensor = [](const DenseTensor& t,
+                               const DenseTensorMeta& m) -> bool {
+    bool r{true};
+    r = r && (t.numel() == product(m.dims));
+    r = r && (t.dims() == m.dims);
+    r = r && (t.data_type() == m.type);
+    r = r && (t.layout() == m.layout);
+    r = r && (t.place() == paddle::platform::CPUPlace());
+    r = r && t.initialized();
+    r = r && t.IsSharedWith(t);
+    return r;
+  };
+
+  DenseTensor tensor_0(alloc, meta);
+  check_dense_tensor(tensor_0, meta);
+
+  DenseTensor tensor_1(alloc, DenseTensorMeta(meta));
+  check_dense_tensor(tensor_0, meta);
+
+  DenseTensor tensor_2(make_intrusive<TensorStorage>(alloc), meta);
+  CHECK(tensor_2.data<int8_t>() == nullptr);
+  CHECK_NOTNULL(tensor_2.mutable_data<int8_t>());
+  check_dense_tensor(tensor_2, meta);
+}
+
+TEST(dense_tensor, resize) {
+  const DDim dims({1, 2});
+  const DataType dtype{DataType::INT8};
+  const DataLayout layout{DataLayout::NHWC};
+  const std::vector<std::vector<size_t>> lod{};
+  DenseTensorMeta meta(dtype, dims, layout, lod);
+
+  auto alloc = std::make_shared<FancyAllocator>();
+  DenseTensor tensor_0(alloc, meta);
+
+  CHECK_EQ(tensor_0.memory_size(), 2u);
+  tensor_0.check_memory_size();
+  tensor_0.Resize({1, 2, 3});
+  CHECK_EQ(tensor_0.memory_size(), 2u);
+  tensor_0.mutable_data<int8_t>();
+  CHECK_EQ(tensor_0.memory_size(), 6u);
+
+  auto storage = tensor_0.release();
+  CHECK_EQ(storage->size(), 6u);
+}
+
+}  // namespace tests
+}  // namespace pten
diff --git a/paddle/pten/tests/core/test_intrusive_ptr.cc b/paddle/pten/tests/core/test_intrusive_ptr.cc
new file mode 100644
index 0000000000000..799f594d10f91
--- /dev/null
+++ b/paddle/pten/tests/core/test_intrusive_ptr.cc
@@ -0,0 +1,97 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <future>
+#include <vector>
+
+#include "gtest/gtest.h"
+
+#include "paddle/pten/core/utils/intrusive_ptr.h"
+#include "paddle/pten/core/utils/intrusive_ref_counter.h"
+
+namespace pten {
+namespace tests {
+
+struct SharedObject : public intrusive_ref_counter<SharedObject> {
+  int i{0};
+};
+
+TEST(intrusive_ref_counter, async) {
+  SharedObject obj;
+  const size_t num{100};
+  std::vector<std::future<void>> results;
+  auto add_ref_and_release = [](const SharedObject* p) {
+    intrusive_ptr_add_ref<SharedObject>(p);
+    intrusive_ptr_release<SharedObject>(p);
+  };
+  for (size_t i = 0; i < num; ++i) {
+    results.emplace_back(std::async(add_ref_and_release, &obj));
+  }
+  for (auto& result : results) {
+    result.get();
+  }
+  CHECK_EQ(obj.use_count(), 1u);
+}
+
+TEST(intrusive_ptr, default_ctor) {
+  intrusive_ptr<SharedObject> p;
+  CHECK(p == nullptr);
+}
+
+TEST(intrusive_ptr, private_ctor) {
+  auto p = make_intrusive<SharedObject>();
+  const auto* ptr0 = p.get();
+  auto p1 = std::move(p);
+  intrusive_ptr<intrusive_ref_counter<SharedObject>> p2(std::move(p1));
+  const auto* ptr1 = p2.get();
+  CHECK_EQ(ptr0, ptr1);
+}
+
+TEST(intrusive_ptr, reset_with_obj) {
+  SharedObject obj;
+  obj.i = 1;
+  intrusive_ptr<SharedObject> p;
+  p.reset(&obj, true);
+  CHECK_EQ(p->i, obj.i);
+}
+
+TEST(intrusive_ptr, reset_with_ptr) {
+  auto* ptr = new SharedObject;
+  ptr->i = 1;
+  intrusive_ptr<SharedObject> p;
+  p.reset(ptr, false);
+  CHECK_EQ((*p).i, ptr->i);
+  p.reset();
+  CHECK(p == nullptr);
+}
+
+TEST(intrusive_ptr, op_comp) {
+  auto p = make_intrusive<SharedObject>();
+  auto copy = copy_intrusive<SharedObject>(p);
+  auto null = intrusive_ptr<SharedObject>();
+  auto p1 = make_intrusive<SharedObject>();
+  CHECK(p == copy);
+  CHECK(p != p1);
+  CHECK(p == copy.get());
+  CHECK(p != p1.get());
+  CHECK(p.get() == copy);
+  CHECK(p.get() != p1);
+  CHECK(null == nullptr);
+  CHECK(nullptr == null);
+  CHECK(p != nullptr);
+  CHECK(nullptr != p);
+}
+
+}  // namespace tests
+}  // namespace pten
diff --git a/paddle/pten/tests/core/test_kernel_factory.cc b/paddle/pten/tests/core/test_kernel_factory.cc
new file mode 100644
index 0000000000000..c1c17171b5898
--- /dev/null
+++ b/paddle/pten/tests/core/test_kernel_factory.cc
@@ -0,0 +1,47 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <iostream>
+#include <sstream>
+
+#include "paddle/pten/core/kernel_factory.h"
+
+#include "gtest/gtest.h"
+
+// TODO(chenweihang): add more unittests later
+
+TEST(KernelName, ConstructAndOStream) {
+  std::ostringstream oss;
+  oss << pten::KernelName("scale", "host");
+  EXPECT_EQ(oss.str(), "scale.host");
+  pten::KernelName kernel_name1("scale.host");
+  EXPECT_EQ(kernel_name1.name(), "scale");
+  EXPECT_EQ(kernel_name1.overload_name(), "host");
+  pten::KernelName kernel_name2("scale.host");
+  EXPECT_EQ(kernel_name2.name(), "scale");
+  EXPECT_EQ(kernel_name2.overload_name(), "host");
+}
+
+TEST(KernelKey, ConstructAndOStream) {
+  pten::KernelKey key(
+      pten::Backend::CPU, pten::DataLayout::NCHW, pten::DataType::FLOAT32);
+  EXPECT_EQ(key.backend(), pten::Backend::CPU);
+  EXPECT_EQ(key.layout(), pten::DataLayout::NCHW);
+  EXPECT_EQ(key.dtype(), pten::DataType::FLOAT32);
+  std::ostringstream oss;
+  oss << key;
+  std::cout << oss.str();
+  // EXPECT_EQ(oss.str(), "scale.host");
+  oss.flush();
+}
diff --git a/paddle/pten/tests/core/test_storage.cc b/paddle/pten/tests/core/test_storage.cc
new file mode 100644
index 0000000000000..69d1eae668c58
--- /dev/null
+++ b/paddle/pten/tests/core/test_storage.cc
@@ -0,0 +1,40 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <vector>
+
+#include "gtest/gtest.h"
+
+#include "paddle/pten/core/storage.h"
+#include "paddle/pten/tests/core/allocator.h"
+
+namespace pten {
+namespace tests {
+
+TEST(host_storage, internal) {
+  // TODO(Shixiaowei02): Here we need to consider the case
+  // where the size is zero.
+  const size_t size{100};
+  const auto a = std::make_shared<FancyAllocator>();
+  TensorStorage storage(a, size);
+  CHECK_EQ(storage.size(), size);
+  CHECK(paddle::platform::is_cpu_place(storage.place()));
+  CHECK(storage.OwnsMemory());
+  CHECK(storage.allocator() == a);
+  storage.Realloc(size + 100);
+  CHECK_EQ(storage.size(), size + 100);
+}
+
+}  // namespace tests
+}  // namespace pten
diff --git a/paddle/pten/tests/core/test_type_info.cc b/paddle/pten/tests/core/test_type_info.cc
new file mode 100644
index 0000000000000..a83f83e90db88
--- /dev/null
+++ b/paddle/pten/tests/core/test_type_info.cc
@@ -0,0 +1,71 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "gtest/gtest.h"
+
+#include "paddle/pten/core/utils/type_registry.h"
+
+namespace pten {
+namespace tests {
+
+template <typename T>
+class Base {
+ public:
+  TypeInfo<Base<T>> type_info() const { return type_info_; }
+
+ private:
+  template <typename T1, typename T2>
+  friend class pten::TypeInfoTraits;
+  TypeInfo<Base<T>> type_info_{TypeInfo<Base<T>>::kUnknownType};
+};
+
+template <typename T>
+class DerivedA : public Base<T>, public TypeInfoTraits<Base<T>, DerivedA<T>> {
+ public:
+  static const char* name() { return "DerivedA"; }
+};
+
+template <typename T>
+class DerivedB : public Base<T>, public TypeInfoTraits<Base<T>, DerivedB<T>> {
+ public:
+  static const char* name() { return "DerivedB"; }
+};
+
+template <typename T>
+void check_type_info() {
+  std::unique_ptr<Base<T>> base(new Base<T>);
+  std::unique_ptr<Base<T>> derived_a(new DerivedA<T>);
+  std::unique_ptr<Base<T>> derived_b(new DerivedB<T>);
+
+  EXPECT_EQ(DerivedA<T>::classof(derived_a.get()), true);
+  EXPECT_EQ(DerivedB<T>::classof(derived_b.get()), true);
+  EXPECT_EQ(DerivedB<T>::classof(derived_a.get()), false);
+  EXPECT_EQ(DerivedA<T>::classof(derived_b.get()), false);
+
+  EXPECT_EQ(base->type_info().id(), 0);
+  EXPECT_EQ(derived_a->type_info().id(), 1);
+  EXPECT_EQ(derived_b->type_info().id(), 2);
+
+  EXPECT_EQ(base->type_info().name(), "Unknown");
+  EXPECT_EQ(derived_a->type_info().name(), "DerivedA");
+  EXPECT_EQ(derived_b->type_info().name(), "DerivedB");
+}
+
+TEST(type_info, base) {
+  check_type_info<int>();
+  check_type_info<float>();
+}
+
+}  // namespace tests
+}  // namespace pten
diff --git a/paddle/pten/tests/core/timer.h b/paddle/pten/tests/core/timer.h
new file mode 100644
index 0000000000000..1ed03192f1243
--- /dev/null
+++ b/paddle/pten/tests/core/timer.h
@@ -0,0 +1,39 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <chrono>  // NOLINT
+
+namespace pten {
+namespace tests {
+
+class Timer {
+ public:
+  std::chrono::high_resolution_clock::time_point start;
+  std::chrono::high_resolution_clock::time_point startu;
+
+  void tic() { start = std::chrono::high_resolution_clock::now(); }
+  double toc() {
+    startu = std::chrono::high_resolution_clock::now();
+    std::chrono::duration<double> time_span =
+        std::chrono::duration_cast<std::chrono::duration<double>>(startu -
+                                                                  start);
+    double used_time_ms = static_cast<double>(time_span.count()) * 1000.0;
+    return used_time_ms;
+  }
+};
+
+}  // namespace tests
+}  // namespace pten
diff --git a/paddle/pten/tests/kernels/CMakeLists.txt b/paddle/pten/tests/kernels/CMakeLists.txt
new file mode 100644
index 0000000000000..b0dc29de52140
--- /dev/null
+++ b/paddle/pten/tests/kernels/CMakeLists.txt
@@ -0,0 +1,6 @@
+cc_test(test_copy_dev_api SRCS test_copy_dev_api.cc DEPS pten pten_api_utils)
+cc_test(test_dot_dev_api SRCS test_dot_dev_api.cc DEPS pten pten_api_utils)
+cc_test(test_fill_dev_api SRCS test_fill_dev_api.cc DEPS pten pten_api_utils)
+cc_test(test_flatten_dev_api SRCS test_flatten_dev_api.cc DEPS pten pten_api_utils)
+cc_test(test_mean_dev_api SRCS test_mean_dev_api.cc DEPS pten pten_api_utils)
+cc_test(test_scale_dev_api SRCS test_scale_dev_api.cc DEPS pten pten_api_utils)
diff --git a/paddle/pten/tests/kernels/test_copy_dev_api.cc b/paddle/pten/tests/kernels/test_copy_dev_api.cc
new file mode 100644
index 0000000000000..dadbcf098dd17
--- /dev/null
+++ b/paddle/pten/tests/kernels/test_copy_dev_api.cc
@@ -0,0 +1,65 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include <memory>
+
+#include "paddle/pten/core/kernel_registry.h"
+#include "paddle/pten/kernels/cpu/utils.h"
+
+#include "paddle/pten/api/lib/utils/allocator.h"
+#include "paddle/pten/core/dense_tensor.h"
+
+PT_DECLARE_MODULE(UtilsCPU);
+
+namespace framework = paddle::framework;
+using DDim = paddle::framework::DDim;
+
+// TODO(YuanRisheng): This TEST file need to be refactored after 'copy' realized
+// in
+// 'paddle/api',
+TEST(DEV_API, copy) {
+  // 1. create tensor
+  const auto alloc = std::make_shared<paddle::experimental::DefaultAllocator>(
+      paddle::platform::CPUPlace());
+  auto dense_src = std::make_shared<pten::DenseTensor>(
+      alloc,
+      pten::DenseTensorMeta(pten::DataType::FLOAT32,
+                            framework::make_ddim({2, 3}),
+                            pten::DataLayout::NCHW));
+  auto* dense_x_data = dense_src->mutable_data<float>();
+
+  auto dense_dst = std::make_shared<pten::DenseTensor>(
+      alloc,
+      pten::DenseTensorMeta(pten::DataType::FLOAT32,
+                            framework::make_ddim({2, 3}),
+                            pten::DataLayout::NCHW));
+
+  for (size_t i = 0; i < 2; ++i) {
+    for (size_t j = 0; j < 3; ++j) {
+      dense_x_data[i * 3 + j] = (i * 3 + j) * 1.0;
+    }
+  }
+  const auto& a = paddle::platform::CPUPlace();
+  std::cout << typeid(a).name() << std::endl;
+  // 2. test API
+  auto& pool = paddle::platform::DeviceContextPool::Instance();
+  auto* dev_ctx = pool.GetByPlace(paddle::platform::CPUPlace());
+  pten::Copy(*dev_ctx, *(dense_src.get()), dense_dst.get());
+
+  // 3. check result
+  for (int64_t i = 0; i < dense_src->numel(); i++) {
+    ASSERT_EQ(dense_src->data<float>()[i], dense_dst->data<float>()[i]);
+  }
+}
diff --git a/paddle/pten/tests/kernels/test_dot_dev_api.cc b/paddle/pten/tests/kernels/test_dot_dev_api.cc
new file mode 100644
index 0000000000000..897c49f4f255e
--- /dev/null
+++ b/paddle/pten/tests/kernels/test_dot_dev_api.cc
@@ -0,0 +1,81 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include <memory>
+
+#include "paddle/pten/include/linalg.h"
+
+#include "paddle/pten/api/lib/utils/allocator.h"
+#include "paddle/pten/core/dense_tensor.h"
+#include "paddle/pten/core/kernel_registry.h"
+
+PT_DECLARE_MODULE(LinalgCPU);
+
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+PT_DECLARE_MODULE(LinalgCUDA);
+#endif
+
+namespace framework = paddle::framework;
+using DDim = paddle::framework::DDim;
+
+TEST(DEV_API, dot) {
+  // 1. create tensor
+  const auto alloc = std::make_shared<paddle::experimental::DefaultAllocator>(
+      paddle::platform::CPUPlace());
+  pten::DenseTensor dense_x(alloc,
+                            pten::DenseTensorMeta(pten::DataType::FLOAT32,
+                                                  framework::make_ddim({3, 10}),
+                                                  pten::DataLayout::NCHW));
+  auto* dense_x_data = dense_x.mutable_data<float>();
+
+  pten::DenseTensor dense_y(alloc,
+                            pten::DenseTensorMeta(pten::DataType::FLOAT32,
+                                                  framework::make_ddim({3, 10}),
+                                                  pten::DataLayout::NCHW));
+  auto* dense_y_data = dense_y.mutable_data<float>();
+
+  float sum[3] = {0.0, 0.0, 0.0};
+  for (size_t i = 0; i < 3; ++i) {
+    for (size_t j = 0; j < 10; ++j) {
+      dense_x_data[i * 10 + j] = (i * 10 + j) * 1.0;
+      dense_y_data[i * 10 + j] = (i * 10 + j) * 1.0;
+      sum[i] += (i * 10 + j) * (i * 10 + j) * 1.0;
+    }
+  }
+
+  paddle::platform::DeviceContextPool& pool =
+      paddle::platform::DeviceContextPool::Instance();
+  auto* dev_ctx = pool.Get(paddle::platform::CPUPlace());
+
+  // 2. test API
+  auto out = pten::Dot<float>(
+      *(static_cast<paddle::platform::CPUDeviceContext*>(dev_ctx)),
+      dense_x,
+      dense_y);
+
+  // 3. check result
+  ASSERT_EQ(out.dims().size(), 2);
+  ASSERT_EQ(out.dims()[0], 3);
+  ASSERT_EQ(out.meta().type, pten::DataType::FLOAT32);
+  ASSERT_EQ(out.meta().layout, pten::DataLayout::NCHW);
+
+  auto expect_result = sum;
+  auto actual_result0 = out.data<float>()[0];
+  auto actual_result1 = out.data<float>()[1];
+  auto actual_result2 = out.data<float>()[2];
+  ASSERT_NEAR(expect_result[0], actual_result0, 1e-6f);
+  ASSERT_NEAR(expect_result[1], actual_result1, 1e-6f);
+  ASSERT_NEAR(expect_result[2], actual_result2, 1e-6f);
+}
diff --git a/paddle/pten/tests/kernels/test_fill_dev_api.cc b/paddle/pten/tests/kernels/test_fill_dev_api.cc
new file mode 100644
index 0000000000000..8546396e18a9a
--- /dev/null
+++ b/paddle/pten/tests/kernels/test_fill_dev_api.cc
@@ -0,0 +1,66 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include <memory>
+
+#include "paddle/pten/include/creation.h"
+
+#include "paddle/pten/api/lib/utils/allocator.h"
+#include "paddle/pten/core/dense_tensor.h"
+#include "paddle/pten/core/kernel_registry.h"
+
+PT_DECLARE_MODULE(CreationCPU);
+
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+PT_DECLARE_MODULE(CreationCUDA);
+#endif
+
+namespace framework = paddle::framework;
+using DDim = paddle::framework::DDim;
+
+TEST(DEV_API, fill_any_like) {
+  // 1. create tensor
+  const auto alloc = std::make_shared<paddle::experimental::DefaultAllocator>(
+      paddle::platform::CPUPlace());
+  pten::DenseTensor dense_x(alloc,
+                            pten::DenseTensorMeta(pten::DataType::FLOAT32,
+                                                  framework::make_ddim({3, 2}),
+                                                  pten::DataLayout::NCHW));
+  auto* dense_x_data = dense_x.mutable_data<float>();
+  dense_x_data[0] = 0;
+  float val = 1.0;
+
+  paddle::platform::DeviceContextPool& pool =
+      paddle::platform::DeviceContextPool::Instance();
+  auto* dev_ctx = pool.Get(paddle::platform::CPUPlace());
+
+  // 2. test API
+  auto out = pten::FillAnyLike<float>(
+      *(static_cast<paddle::platform::CPUDeviceContext*>(dev_ctx)),
+      dense_x,
+      val);
+
+  // 3. check result
+  ASSERT_EQ(out.dims().size(), 2);
+  ASSERT_EQ(out.dims()[0], 3);
+  ASSERT_EQ(out.numel(), 6);
+  ASSERT_EQ(out.meta().type, pten::DataType::FLOAT32);
+  ASSERT_EQ(out.meta().layout, pten::DataLayout::NCHW);
+
+  auto* actual_result = out.data<float>();
+  for (auto i = 0; i < 6; i++) {
+    ASSERT_NEAR(actual_result[i], val, 1e-6f);
+  }
+}
diff --git a/paddle/pten/tests/kernels/test_flatten_dev_api.cc b/paddle/pten/tests/kernels/test_flatten_dev_api.cc
new file mode 100644
index 0000000000000..16e6aa5acdd98
--- /dev/null
+++ b/paddle/pten/tests/kernels/test_flatten_dev_api.cc
@@ -0,0 +1,75 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include <memory>
+
+#include "paddle/pten/include/manipulation.h"
+
+#include "paddle/pten/api/lib/utils/allocator.h"
+#include "paddle/pten/core/dense_tensor.h"
+#include "paddle/pten/core/kernel_registry.h"
+
+PT_DECLARE_MODULE(ManipulationCPU);
+
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+PT_DECLARE_MODULE(ManipulationCUDA);
+#endif
+
+namespace framework = paddle::framework;
+using DDim = paddle::framework::DDim;
+
+TEST(DEV_API, flatten) {
+  // 1. create tensor
+  const auto alloc = std::make_shared<paddle::experimental::DefaultAllocator>(
+      paddle::platform::CPUPlace());
+  pten::DenseTensor dense_x(
+      alloc,
+      pten::DenseTensorMeta(pten::DataType::FLOAT32,
+                            framework::make_ddim({3, 2, 2, 3}),
+                            pten::DataLayout::NCHW));
+  auto* dense_x_data = dense_x.mutable_data<float>();
+
+  for (int i = 0; i < dense_x.numel(); i++) {
+    dense_x_data[i] = i;
+  }
+  int start_axis = 1, stop_axis = 2;
+  paddle::platform::DeviceContextPool& pool =
+      paddle::platform::DeviceContextPool::Instance();
+  auto* dev_ctx = pool.Get(paddle::platform::CPUPlace());
+
+  // 2. test API
+  auto out = pten::Flatten<float>(
+      *(static_cast<paddle::platform::CPUDeviceContext*>(dev_ctx)),
+      dense_x,
+      start_axis,
+      stop_axis);
+
+  // 3. check result
+  std::vector<int> expect_shape = {3, 4, 3};
+  ASSERT_EQ(out.dims()[0], expect_shape[0]);
+  ASSERT_EQ(out.dims()[1], expect_shape[1]);
+  ASSERT_EQ(out.dims()[2], expect_shape[2]);
+  ASSERT_EQ(out.numel(), 36);
+  ASSERT_EQ(out.meta().type, pten::DataType::FLOAT32);
+  ASSERT_EQ(out.meta().layout, pten::DataLayout::NCHW);
+
+  bool value_equal = true;
+  auto* dense_out_data = out.data<float>();
+  for (int i = 0; i < dense_x.numel(); i++) {
+    if (std::abs(dense_x_data[i] - dense_out_data[i]) > 1e-6f)
+      value_equal = false;
+  }
+  ASSERT_EQ(value_equal, true);
+}
diff --git a/paddle/pten/tests/kernels/test_mean_dev_api.cc b/paddle/pten/tests/kernels/test_mean_dev_api.cc
new file mode 100644
index 0000000000000..43f9371d72a2f
--- /dev/null
+++ b/paddle/pten/tests/kernels/test_mean_dev_api.cc
@@ -0,0 +1,64 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include <memory>
+
+#include "paddle/pten/include/math.h"
+
+#include "paddle/pten/api/lib/utils/allocator.h"
+#include "paddle/pten/core/dense_tensor.h"
+#include "paddle/pten/core/kernel_registry.h"
+
+PT_DECLARE_MODULE(MathCPU);
+
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+PT_DECLARE_MODULE(MathCUDA);
+#endif
+
+namespace framework = paddle::framework;
+using DDim = paddle::framework::DDim;
+
+TEST(DEV_API, mean) {
+  // 1. create tensor
+  const auto alloc = std::make_shared<paddle::experimental::DefaultAllocator>(
+      paddle::platform::CPUPlace());
+  pten::DenseTensor dense_x(alloc,
+                            pten::DenseTensorMeta(pten::DataType::FLOAT32,
+                                                  framework::make_ddim({3, 4}),
+                                                  pten::DataLayout::NCHW));
+  auto* dense_x_data = dense_x.mutable_data<float>();
+
+  float sum = 0.0;
+  for (size_t i = 0; i < 12; ++i) {
+    dense_x_data[i] = i * 1.0;
+    sum += i * 1.0;
+  }
+  paddle::platform::DeviceContextPool& pool =
+      paddle::platform::DeviceContextPool::Instance();
+  auto* dev_ctx = pool.Get(paddle::platform::CPUPlace());
+  // 2. test API
+  auto out = pten::Mean<float>(
+      *(static_cast<paddle::platform::CPUDeviceContext*>(dev_ctx)), dense_x);
+
+  // 3. check result
+  ASSERT_EQ(out.dims().size(), 1);
+  ASSERT_EQ(out.numel(), 1);
+  ASSERT_EQ(out.meta().type, pten::DataType::FLOAT32);
+  ASSERT_EQ(out.meta().layout, pten::DataLayout::NCHW);
+
+  auto expect_result = sum / 12;
+  auto actual_result = out.data<float>()[0];
+  ASSERT_NEAR(expect_result, actual_result, 1e-6f);
+}
diff --git a/paddle/pten/tests/kernels/test_scale_dev_api.cc b/paddle/pten/tests/kernels/test_scale_dev_api.cc
new file mode 100644
index 0000000000000..f1764e803250a
--- /dev/null
+++ b/paddle/pten/tests/kernels/test_scale_dev_api.cc
@@ -0,0 +1,116 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include <memory>
+
+#include "paddle/pten/include/math.h"
+
+#include "paddle/pten/api/lib/utils/allocator.h"
+#include "paddle/pten/core/dense_tensor.h"
+#include "paddle/pten/core/kernel_registry.h"
+
+PT_DECLARE_MODULE(MathCPU);
+
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+PT_DECLARE_MODULE(MathCUDA);
+#endif
+
+namespace framework = paddle::framework;
+using DDim = paddle::framework::DDim;
+
+TEST(DEV_API, scale) {
+  // 1. create tensor
+  const auto alloc = std::make_shared<paddle::experimental::DefaultAllocator>(
+      paddle::platform::CPUPlace());
+  pten::DenseTensor dense_x(alloc,
+                            pten::DenseTensorMeta(pten::DataType::FLOAT32,
+                                                  framework::make_ddim({3, 4}),
+                                                  pten::DataLayout::NCHW));
+
+  auto* dense_x_data = dense_x.mutable_data<float>();
+  for (size_t i = 0; i < 12; ++i) {
+    dense_x_data[i] = i * 1.0;
+  }
+  float scale = 2;
+  float bias = 1;
+  bool bias_after_scale = true;
+
+  paddle::platform::DeviceContextPool& pool =
+      paddle::platform::DeviceContextPool::Instance();
+  auto* dev_ctx = pool.Get(paddle::platform::CPUPlace());
+
+  // 2. test API
+  auto out = pten::Scale<float>(
+      *(static_cast<paddle::platform::CPUDeviceContext*>(dev_ctx)),
+      dense_x,
+      scale,
+      bias,
+      bias_after_scale);
+
+  // 3. check result
+  ASSERT_EQ(out.dims().size(), 2);
+  ASSERT_EQ(out.numel(), 12);
+  ASSERT_EQ(out.meta().type, pten::DataType::FLOAT32);
+  ASSERT_EQ(out.meta().layout, pten::DataLayout::NCHW);
+
+  auto expect_result = 23;
+  auto actual_result = out.data<float>()[11];
+  ASSERT_NEAR(expect_result, actual_result, 1e-6f);
+}
+
+TEST(DEV_API, scale_host) {
+  // 1. create tensor
+  const auto alloc = std::make_shared<paddle::experimental::DefaultAllocator>(
+      paddle::platform::CPUPlace());
+  pten::DenseTensor dense_x(alloc,
+                            pten::DenseTensorMeta(pten::DataType::FLOAT32,
+                                                  framework::make_ddim({3, 4}),
+                                                  pten::DataLayout::NCHW));
+  auto* dense_x_data = dense_x.mutable_data<float>();
+  for (size_t i = 0; i < 12; ++i) {
+    dense_x_data[i] = i * 1.0;
+  }
+  const auto alloc2 = std::make_shared<paddle::experimental::DefaultAllocator>(
+      paddle::platform::CPUPlace());
+  pten::DenseTensor scale(alloc2,
+                          pten::DenseTensorMeta(pten::DataType::FLOAT32,
+                                                framework::make_ddim({1}),
+                                                pten::DataLayout::NCHW));
+  scale.mutable_data<float>()[0] = 2;
+  float bias = 1;
+  bool bias_after_scale = true;
+
+  paddle::platform::DeviceContextPool& pool =
+      paddle::platform::DeviceContextPool::Instance();
+  auto* dev_ctx = pool.Get(paddle::platform::CPUPlace());
+
+  // 2. test API
+  auto out = pten::Scale<float>(
+      *(static_cast<paddle::platform::CPUDeviceContext*>(dev_ctx)),
+      dense_x,
+      scale,
+      bias,
+      bias_after_scale);
+
+  // 3. check result
+  ASSERT_EQ(out.dims().size(), 2);
+  ASSERT_EQ(out.numel(), 12);
+  ASSERT_EQ(out.meta().type, pten::DataType::FLOAT32);
+  ASSERT_EQ(out.meta().layout, pten::DataLayout::NCHW);
+
+  auto expect_result = 23;
+  auto actual_result = out.data<float>()[11];
+  ASSERT_NEAR(expect_result, actual_result, 1e-6f);
+}
diff --git a/paddle/scripts/paddle_build.bat b/paddle/scripts/paddle_build.bat
index e44c877d6a2f3..34a8f10458d7b 100644
--- a/paddle/scripts/paddle_build.bat
+++ b/paddle/scripts/paddle_build.bat
@@ -249,6 +249,22 @@ call :test_unit || goto test_unit_error
 :: call :check_change_of_unittest || goto check_change_of_unittest_error
 goto:success
 
+rem ------PR CI windows check for unittests and inference in CUDA11-MKL-AVX----------
+:CASE_wincheck_inference
+set WITH_MKL=ON
+set WITH_GPU=ON
+set WITH_AVX=ON
+set MSVC_STATIC_CRT=ON
+set ON_INFER=ON
+
+call :cmake || goto cmake_error
+call :build || goto build_error
+call :test_whl_pacakage || goto test_whl_pacakage_error
+:: call :test_unit || goto test_unit_error
+::call :test_inference || goto test_inference_error
+:: call :check_change_of_unittest || goto check_change_of_unittest_error
+goto:success
+
 rem ------Build windows avx whl package------
 :CASE_build_avx_whl
 set WITH_AVX=ON
@@ -659,6 +675,8 @@ setlocal enabledelayedexpansion
 :: for /F %%# in ('cmd /C nvidia-smi -L ^|find "GPU" /C') do set CUDA_DEVICE_COUNT=%%#
 set CUDA_DEVICE_COUNT=1
 
+:: For hypothesis tests(mkldnn op and inference pass), we set use 'ci' profile
+set HYPOTHESIS_TEST_PROFILE=ci
 echo cmake .. -G %GENERATOR% -DCMAKE_BUILD_TYPE=Release -DWITH_AVX=%WITH_AVX% -DWITH_GPU=%WITH_GPU% -DWITH_MKL=%WITH_MKL% ^
 -DWITH_TESTING=%WITH_TESTING% -DWITH_PYTHON=%WITH_PYTHON% -DON_INFER=%ON_INFER% ^
 -DWITH_INFERENCE_API_TEST=%WITH_INFERENCE_API_TEST% -DTHIRD_PARTY_PATH=%THIRD_PARTY_PATH% ^
@@ -676,6 +694,8 @@ echo    ========================================
 echo    Running CPU unit tests in parallel way ...
 echo    ========================================
 
+:: For hypothesis tests(mkldnn op and inference pass), we set use 'ci' profile
+set HYPOTHESIS_TEST_PROFILE=ci
 %cache_dir%\tools\busybox64.exe bash %work_dir%\tools\windows\run_unittests.sh %NIGHTLY_MODE% %PRECISION_TEST% %WITH_GPU%
 
 goto:eof
diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh
index 9bdd9e14d58dc..11ac1126e8d3b 100755
--- a/paddle/scripts/paddle_build.sh
+++ b/paddle/scripts/paddle_build.sh
@@ -202,6 +202,7 @@ function cmake_base() {
         -DWITH_GPU=${WITH_GPU:-OFF}
         -DWITH_TENSORRT=${WITH_TENSORRT:-ON}
         -DWITH_ROCM=${WITH_ROCM:-OFF}
+        -DWITH_CINN=${WITH_CINN:-OFF}
         -DWITH_DISTRIBUTE=${distibuted_flag}
         -DWITH_MKL=${WITH_MKL:-ON}
         -DWITH_AVX=${WITH_AVX:-OFF}
@@ -246,6 +247,7 @@ EOF
         -DWITH_GPU=${WITH_GPU:-OFF} \
         -DWITH_TENSORRT=${WITH_TENSORRT:-ON} \
         -DWITH_ROCM=${WITH_ROCM:-OFF} \
+        -DWITH_CINN=${WITH_CINN:-OFF} \
         -DWITH_DISTRIBUTE=${distibuted_flag} \
         -DWITH_MKL=${WITH_MKL:-ON} \
         -DWITH_AVX=${WITH_AVX:-OFF} \
@@ -581,12 +583,16 @@ EOF
 
         if [ "$1" == "cp36-cp36m" ]; then
             pip3.6 install --user ${INSTALL_PREFIX:-/paddle/build}/opt/paddle/share/wheels/*.whl
+            pip3.6 install --user hypothesis
         elif [ "$1" == "cp37-cp37m" ]; then
             pip3.7 install --user ${INSTALL_PREFIX:-/paddle/build}/opt/paddle/share/wheels/*.whl
+            pip3.7 install --user hypothesis
         elif [ "$1" == "cp38-cp38" ]; then
             pip3.8 install --user ${INSTALL_PREFIX:-/paddle/build}/opt/paddle/share/wheels/*.whl
+            pip3.8 install --user hypothesis
         elif [ "$1" == "cp39-cp39" ]; then
             pip3.9 install --user ${INSTALL_PREFIX:-/paddle/build}/opt/paddle/share/wheels/*.whl
+            pip3.9 install --user hypothesis
         fi
         tmpfile_rand=`date +%s%N`
         tmpfile=$tmp_dir/$tmpfile_rand
@@ -1893,6 +1899,7 @@ set -ex
 function parallel_test() {
     mkdir -p ${PADDLE_ROOT}/build
     cd ${PADDLE_ROOT}/build
+    pip install hypothesis
     pip install ${PADDLE_ROOT}/build/python/dist/*whl
     cp ${PADDLE_ROOT}/build/python/paddle/fluid/tests/unittests/op_test.py ${PADDLE_ROOT}/build/python
     ut_total_startTime_s=`date +%s`
@@ -2293,11 +2300,11 @@ function collect_ccache_hits() {
 
 function test_op_benchmark() {
     # The PR will pass quickly when get approval from specific person.
-    # Xreki 12538138, luotao1 6836917, Avin0323 16167147
+    # Xreki 12538138, luotao1 6836917, Avin0323 23427135
     set +x
     approval_line=$(curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000)
     if [ "${approval_line}" != "" ]; then
-        APPROVALS=$(echo ${approval_line} | python ${PADDLE_ROOT}/tools/check_pr_approval.py 1 16167147 12538138 6836917)
+        APPROVALS=$(echo ${approval_line} | python ${PADDLE_ROOT}/tools/check_pr_approval.py 1 23427135 12538138 6836917)
         echo "current pr ${GIT_PR_ID} got approvals: ${APPROVALS}"
         if [ "${APPROVALS}" == "TRUE" ]; then
             echo "==================================="
@@ -2388,13 +2395,32 @@ function find_temporary_files() {
     fi
 }
 
+function trt_convert_test() {
+    set +e
+    cd ${PADDLE_ROOT}
+    result_num=0
+    export PYTHONPATH=$PYTHONPATH:${PADDLE_ROOT}/build/python
+    for file_name in `find python/ -name 'test_trt_convert*'`;do
+        echo "----- test trt ut: $file_name -----"
+        python $file_name
+        res=$?
+        if [ "$res" != "0" ];then
+            echo "$file_name convert test failed " >&2
+            result_num=11
+        fi
+    done
+    if [ "$result_num" != "0" ];then
+        exit 11
+    fi
+}
+
 function build_pr_and_develop() {
     cmake_gen_and_build ${PYTHON_ABI:-""} ${parallel_number}
     mkdir ${PADDLE_ROOT}/build/pr_whl && cp ${PADDLE_ROOT}/build/python/dist/*.whl ${PADDLE_ROOT}/build/pr_whl
     rm -f ${PADDLE_ROOT}/build/python/dist/*.whl && rm -f ${PADDLE_ROOT}/build/python/build/.timestamp
-    rm -rf ${PADDLE_ROOT}/build/Makefile ${PADDLE_ROOT}/build/CMakeCache.txt
     cmake_change=`git diff --name-only upstream/$BRANCH | grep "cmake/external" || true`
     if [ ${cmake_change} ];then
+        rm -rf ${PADDLE_ROOT}/build/Makefile ${PADDLE_ROOT}/build/CMakeCache.txt
         rm -rf ${PADDLE_ROOT}/build/third_party
     fi
     git checkout .
@@ -2656,6 +2682,10 @@ function main() {
       test_model_benchmark)
         test_model_benchmark
         ;;
+      trt_convert_test)
+        # only test trt convert.
+        trt_convert_test
+        ;;
       *)
         print_usage
         exit 1
diff --git a/paddle/utils/small_vector.h b/paddle/utils/small_vector.h
index f51a3b623ce3b..e9e7996babcf7 100644
--- a/paddle/utils/small_vector.h
+++ b/paddle/utils/small_vector.h
@@ -3,6 +3,8 @@
 // 1. remove  macro
 // 2. remove LLVM_LIKELY and LLVM_UNLIKELY
 // 3. add at(index) method for small vector
+// 4. wrap the call to max and min with parenthesis to prevent the macro
+// expansion to fix the build error on windows platform
 
 //===- llvm/ADT/SmallVector.h - 'Normally small' vectors --------*- C++ -*-===//
 //
@@ -90,7 +92,7 @@ class SmallVectorBase {
 
   /// The maximum value of the Size_T used.
   static constexpr size_t SizeTypeMax() {
-    return std::numeric_limits<Size_T>::max();
+    return (std::numeric_limits<Size_T>::max)();
   }
 
   SmallVectorBase() = delete;
@@ -309,7 +311,7 @@ class SmallVectorTemplateCommon
 
   size_type size_in_bytes() const { return size() * sizeof(T); }
   size_type max_size() const {
-    return std::min(this->SizeTypeMax(), size_type(-1) / sizeof(T));
+    return (std::min)(this->SizeTypeMax(), size_type(-1) / sizeof(T));
   }
 
   size_t capacity_in_bytes() const { return capacity() * sizeof(T); }
@@ -727,7 +729,7 @@ class SmallVectorImpl : public SmallVectorTemplateBase<T> {
     }
 
     // Assign over existing elements.
-    std::fill_n(this->begin(), std::min(NumElts, this->size()), Elt);
+    std::fill_n(this->begin(), (std::min)(NumElts, this->size()), Elt);
     if (NumElts > this->size())
       std::uninitialized_fill_n(this->end(), NumElts - this->size(), Elt);
     else if (NumElts < this->size())
@@ -1393,7 +1395,7 @@ static void report_at_maximum_capacity(size_t MaxSize) {
 // Note: Moving this function into the header may cause performance regression.
 template <class Size_T>
 static size_t getNewCapacity(size_t MinSize, size_t TSize, size_t OldCapacity) {
-  constexpr size_t MaxSize = std::numeric_limits<Size_T>::max();
+  constexpr size_t MaxSize = (std::numeric_limits<Size_T>::max)();
 
   // Ensure we can fit the new capacity.
   // This is only going to be applicable when the capacity is 32 bit.
@@ -1408,7 +1410,7 @@ static size_t getNewCapacity(size_t MinSize, size_t TSize, size_t OldCapacity) {
   // In theory 2*capacity can overflow if the capacity is 64 bit, but the
   // original capacity would never be large enough for this to be a problem.
   size_t NewCapacity = 2 * OldCapacity + 1;  // Always grow.
-  return std::min(std::max(NewCapacity, MinSize), MaxSize);
+  return (std::min)((std::max)(NewCapacity, MinSize), MaxSize);
 }
 
 // Note: Moving this function into the header may cause performance regression.
diff --git a/python/paddle/__init__.py b/python/paddle/__init__.py
index 29548a64f3dad..c0421eb9d46dc 100755
--- a/python/paddle/__init__.py
+++ b/python/paddle/__init__.py
@@ -101,6 +101,7 @@
 from .tensor.linalg import bincount  # noqa: F401
 from .tensor.linalg import mv  # noqa: F401
 from .tensor.logic import equal  # noqa: F401
+from .tensor.linalg import eigvalsh  # noqa: F401
 from .tensor.logic import greater_equal  # noqa: F401
 from .tensor.logic import greater_than  # noqa: F401
 from .tensor.logic import is_empty  # noqa: F401
@@ -229,6 +230,7 @@
 from .tensor.random import randn  # noqa: F401
 from .tensor.random import rand  # noqa: F401
 from .tensor.random import randint  # noqa: F401
+from .tensor.random import randint_like  # noqa: F401
 from .tensor.random import randperm  # noqa: F401
 from .tensor.search import argmax  # noqa: F401
 from .tensor.search import argmin  # noqa: F401
@@ -375,6 +377,7 @@
            'ParamAttr',
            'stanh',
            'randint',
+           'randint_like',
            'assign',
            'gather',
            'scale',
diff --git a/python/paddle/cost_model/__init__.py b/python/paddle/cost_model/__init__.py
new file mode 100644
index 0000000000000..65f2533032ae3
--- /dev/null
+++ b/python/paddle/cost_model/__init__.py
@@ -0,0 +1,16 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .cost_model import CostModel  # noqa: F401
+__all__ = ['CostModel']
diff --git a/python/paddle/cost_model/cost_model.py b/python/paddle/cost_model/cost_model.py
index 93c89d0c89297..e6a87468a1172 100644
--- a/python/paddle/cost_model/cost_model.py
+++ b/python/paddle/cost_model/cost_model.py
@@ -15,6 +15,8 @@
 import paddle
 import paddle.static as static
 import numpy as np
+import json
+import os
 from paddle.fluid import core
 
 
@@ -36,7 +38,6 @@ def build_program(self):
             paddle.optimizer.SGD(learning_rate=0.01).minimize(loss)
 
         print("main program is: {}".format(main_program))
-        #print("start up program is: {}".format(startup_program))
 
         return startup_program, main_program
 
@@ -44,7 +45,7 @@ def profile_measure(self,
                         startup_program,
                         main_program,
                         device='gpu',
-                        fetch_cost_list=['time', 'memory']):
+                        fetch_cost_list=['time']):
 
         place = paddle.set_device('gpu')
         x = np.random.random(size=(10, 1)).astype('float32')
@@ -53,17 +54,33 @@ def profile_measure(self,
         exe.run(startup_program)
         paddle.fluid.profiler.start_profiler("All")
         exe.run(main_program, feed={"X": x}, fetch_list=[])
-        # core.CostModel.ProfileMeasure(main_program, device)
-        print("core:<<<<<<<")
 
         cost_model = core.CostModel()
         cost_data = cost_model.ProfileMeasure(device)
-        # cost_list = self.stop_cost_model()
-        # return cost_list
 
+    def static_cost_data(self):
+        static_cost_data_path = os.path.join(
+            os.path.dirname(__file__), "static_op_benchmark.json")
+        with open(static_cost_data_path, 'r') as load_f:
+            load_dict = json.load(load_f)
+        self._static_cost_data = load_dict
+        # return all static cost data
+        return load_dict
 
-cost_model = CostModel()
+    def get_static_op_time(self, op_name, forward=True, dtype="float32"):
+        # if forward is True, return op forward time, otherwise return op backward time.
+        if op_name == None:
+            raise ValueError(
+                'op_name should not be empty when you want to get static op time'
+            )
 
-startup_program, main_program = cost_model.build_program()
+        op_cost = {}
+        for op_data in self._static_cost_data:
+            if (op_data["op"] == op_name) and (dtype in op_data["config"]):
+                if (forward):
+                    op_cost["op_time"] = op_data["paddle_gpu_time"]
+                else:
+                    op_cost["op_time"] = op_data["paddle_gpu_time_backward"]
+                op_cost["config"] = op_data["config"]
 
-cost_model.profile_measure(startup_program, main_program)
+        return op_cost
diff --git a/python/paddle/cost_model/static_op_benchmark.json b/python/paddle/cost_model/static_op_benchmark.json
new file mode 100644
index 0000000000000..b516f810e53e8
--- /dev/null
+++ b/python/paddle/cost_model/static_op_benchmark.json
@@ -0,0 +1 @@
+[{"name":"abs_0","op":"abs","op_count":2,"config":"x (Variable) - dtype: float32, shape: [16, 128, 257, 257]\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/abs_0-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/abs_0-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/abs_0-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/abs_0-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/abs_0-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/abs_0-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"1.331090353772731","paddle_perf_backwards":"3.3697429305326962","paddle_gpu_time":"1.3211645008562505","paddle_gpu_time_backward":"3.387969185379446"},{"name":"abs_1","op":"abs","op_count":2,"config":"x (Variable) - dtype: float16, shape: [16, 128, 257, 257]\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/abs_1-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/abs_1-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/abs_1-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/abs_1-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/abs_1-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/abs_1-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.6877529119441886","paddle_perf_backwards":"2.01086037622425","paddle_gpu_time":"0.6747270346494761","paddle_gpu_time_backward":"2.078491124260355"},{"name":"accuracy_0","op":"accuracy","op_count":62,"config":"input (Variable) - dtype: float16, shape: [16, 3]\nlabel (Variable) - dtype: int64, shape: [16, 1]\ntotal (Variable) - dtype: int64, shape: []\ncorrect (string): None\nk (int): 1\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/accuracy_0-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/accuracy_0-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/accuracy_0-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/accuracy_0-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/accuracy_0-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/accuracy_0-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"--","paddle_perf":"0.08176540841861646","paddle_perf_backwards":"--","paddle_gpu_time":"0.03162362718907688","paddle_gpu_time_backward":"--"},{"name":"accuracy_1","op":"accuracy","op_count":62,"config":"input (Variable) - dtype: float32, shape: [16, 1000]\nlabel (Variable) - dtype: int64, shape: [16, 1]\ncorrect (string): None\nk (int): 1\ntotal (string): None\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/accuracy_1-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/accuracy_1-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/accuracy_1-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/accuracy_1-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/accuracy_1-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/accuracy_1-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"--","paddle_perf":"0.09902667026130521","paddle_perf_backwards":"--","paddle_gpu_time":"0.05448726932959588","paddle_gpu_time_backward":"--"},{"name":"add_0","op":"add","op_count":0,"config":"x (Variable) - dtype: float32, shape: [50, 128, 1000]\ny (Variable) - dtype: float32, shape: [128, 1000]\naxis (int): -1\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/add_0-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/add_0-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/add_0-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/add_0-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/add_0-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/add_0-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.08375929879235315","paddle_perf_backwards":"0.19740152645397474","paddle_gpu_time":"0.06437907840578255","paddle_gpu_time_backward":"0.16722634967805844"},{"name":"add_1","op":"add","op_count":0,"config":"x (Variable) - dtype: float32, shape: [50, 128, 1000]\ny (Variable) - dtype: float32, shape: [1, 128, 1000]\naxis (int): 0\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/add_1-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/add_1-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/add_1-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/add_1-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/add_1-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/add_1-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.0836590487875776","paddle_perf_backwards":"0.19311931185827463","paddle_gpu_time":"0.06452402538531278","paddle_gpu_time_backward":"0.16717375093214018"},{"name":"add_2","op":"add","op_count":0,"config":"x (Variable) - dtype: float32, shape: [16, 2048, 7, 7]\ny (Variable) - dtype: float32, shape: [16, 2048]\naxis (int): 0\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/add_2-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/add_2-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/add_2-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/add_2-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/add_2-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/add_2-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.038008699436225965","paddle_perf_backwards":"0.10008589777057778","paddle_gpu_time":"0.01757676432095037","paddle_gpu_time_backward":"0.04874236252545825"},{"name":"add_3","op":"add","op_count":0,"config":"x (Variable) - dtype: float32, shape: [16, 2048, 16, 16]\ny (Variable) - dtype: float32, shape: [16, 2048, 16, 16]\naxis (int): -1\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/add_3-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/add_3-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/add_3-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/add_3-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/add_3-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/add_3-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.14193958653237873","paddle_perf_backwards":"0.26894370635191284","paddle_gpu_time":"0.121376489598061","paddle_gpu_time_backward":"0.25036374246255594"},{"name":"add_4","op":"add","op_count":0,"config":"x (Variable) - dtype: float32, shape: [16, 1, 513, 513]\ny (Variable) - dtype: float32, shape: [1]\naxis (int): -1\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/add_4-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/add_4-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/add_4-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/add_4-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/add_4-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/add_4-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.06155439751420566","paddle_perf_backwards":"0.14206386042501262","paddle_gpu_time":"0.04269137792103142","paddle_gpu_time_backward":"0.11509082102203923"},{"name":"add_5","op":"add","op_count":0,"config":"x (Variable) - dtype: float32, shape: [512, 896, 4, 12]\ny (Variable) - dtype: float32, shape: [512, 896, 4, 1]\naxis (int): -1\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/add_5-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/add_5-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/add_5-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/add_5-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/add_5-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/add_5-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.2454338905089843","paddle_perf_backwards":"0.6398088468578392","paddle_gpu_time":"0.22470623803285297","paddle_gpu_time_backward":"0.6160299727520435"},{"name":"add_6","op":"add","op_count":0,"config":"x (Variable) - dtype: float16, shape: [512, 896, 4, 12]\ny (Variable) - dtype: float16, shape: [512, 896, 4, 1]\naxis (int): -1\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/add_6-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/add_6-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/add_6-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/add_6-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/add_6-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/add_6-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.13866807562280373","paddle_perf_backwards":"0.42508208608052817","paddle_gpu_time":"0.12125367684349325","paddle_gpu_time_backward":"0.39961179007907977"},{"name":"add_7","op":"add","op_count":0,"config":"x (Variable) - dtype: float16, shape: [32, 12, 128, 128]\ny (Variable) - dtype: float16, shape: [32, 1, 1, 128]\naxis (int): -1\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/add_7-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/add_7-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/add_7-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/add_7-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/add_7-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/add_7-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.05299101871658998","paddle_perf_backwards":"0.11762414112627266","paddle_gpu_time":"0.03463261376304855","paddle_gpu_time_backward":"0.09322676781360066"},{"name":"add_8","op":"add","op_count":0,"config":"x (Variable) - dtype: float16, shape: [32, 1, 1, 128]\ny (Variable) - dtype: float16, shape: [1, 12, 128, 1]\naxis (int): -1\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/add_8-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/add_8-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/add_8-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/add_8-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/add_8-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/add_8-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"False","paddle_perf":"0.05768260401570965","paddle_perf_backwards":"0.1170390832400274","paddle_gpu_time":"0.04146914691469147","paddle_gpu_time_backward":"0.09352805887764491"},{"name":"add_n_0","op":"add_n","op_count":0,"config":"inputs (list<Variable>[2]) - dtype: float32, shape: [1]; \n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/add_n_0-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/add_n_0-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/add_n_0-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/add_n_0-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/add_n_0-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/add_n_0-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"--","paddle_perf":"0.02119030271257673","paddle_perf_backwards":"--","paddle_gpu_time":"0.001314003261312678","paddle_gpu_time_backward":"--"},{"name":"add_n_1","op":"add_n","op_count":0,"config":"inputs (list<Variable>[8]) - dtype: float32, shape: [1]; \n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/add_n_1-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/add_n_1-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/add_n_1-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/add_n_1-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/add_n_1-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/add_n_1-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"--","paddle_perf":"0.03457020740119778","paddle_perf_backwards":"--","paddle_gpu_time":"0.004663965424095083","paddle_gpu_time_backward":"--"},{"name":"add_n_2","op":"add_n","op_count":0,"config":"inputs (list<Variable>[4]) - dtype: float32, shape: [16, 256]; \n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/add_n_2-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/add_n_2-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/add_n_2-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/add_n_2-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/add_n_2-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/add_n_2-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"--","paddle_perf":"0.033566903094856104","paddle_perf_backwards":"--","paddle_gpu_time":"0.004455732567249934","paddle_gpu_time_backward":"--"},{"name":"affine_grid_0","op":"affine_grid","op_count":0,"config":"theta (Variable) - dtype: float32, shape: [32, 2, 3]\nalign_corners (bool): True\nout_shape (list): [32, 3, 128, 128]\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/affine_grid_0-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/affine_grid_0-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/affine_grid_0-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/affine_grid_0-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/affine_grid_0-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/affine_grid_0-tensorflow_gpu_speed_forward.txt","paddle_consistency":"--","paddle_consistency_backwards":"--","paddle_perf":"0.025103321994643615","paddle_perf_backwards":"0.05442976472846954","paddle_gpu_time":"0.008732239343606164","paddle_gpu_time_backward":"0.02619939577039275"},{"name":"affine_grid_1","op":"affine_grid","op_count":0,"config":"theta (Variable) - dtype: float32, shape: [32, 2, 3]\nalign_corners (bool): False\nout_shape (list): [32, 3, 128, 128]\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/affine_grid_1-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/affine_grid_1-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/affine_grid_1-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/affine_grid_1-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/affine_grid_1-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/affine_grid_1-tensorflow_gpu_speed_forward.txt","paddle_consistency":"--","paddle_consistency_backwards":"--","paddle_perf":"0.02413375310629726","paddle_perf_backwards":"3.074037597840091","paddle_gpu_time":"0.008943471735867934","paddle_gpu_time_backward":"3.0098403454162064"},{"name":"any_0","op":"any","op_count":0,"config":"x (Variable) - dtype: bool, shape: [16, 2048, 33, 33]\naxis (list): [2, 3]\nkeepdim (bool): True\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/any_0-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/any_0-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/any_0-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/any_0-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/any_0-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/any_0-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"--","paddle_perf":"0.09390927985579313","paddle_perf_backwards":"--","paddle_gpu_time":"0.08013408609738885","paddle_gpu_time_backward":"--"},{"name":"any_1","op":"any","op_count":0,"config":"x (Variable) - dtype: bool, shape: [16, 8, 128]\naxis (list): [1]\nkeepdim (bool): False\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/any_1-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/any_1-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/any_1-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/any_1-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/any_1-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/any_1-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"--","paddle_perf":"0.04386244987954899","paddle_perf_backwards":"--","paddle_gpu_time":"0.0015045274188625493","paddle_gpu_time_backward":"--"},{"name":"any_2","op":"any","op_count":0,"config":"x (Variable) - dtype: bool, shape: [16, 16, 1, 1]\naxis (list): [0]\nkeepdim (bool): False\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/any_2-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/any_2-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/any_2-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/any_2-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/any_2-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/any_2-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"--","paddle_perf":"0.024226490332155813","paddle_perf_backwards":"--","paddle_gpu_time":"0.0016181301967146623","paddle_gpu_time_backward":"--"},{"name":"any_3","op":"any","op_count":0,"config":"x (Variable) - dtype: bool, shape: [30522, 1024]\naxis (string): None\nkeepdim (bool): False\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/any_3-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/any_3-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/any_3-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/any_3-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/any_3-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/any_3-tensorflow_gpu_speed_forward.txt","paddle_consistency":"--","paddle_consistency_backwards":"--","paddle_perf":"0.06490458467441475","paddle_perf_backwards":"--","paddle_gpu_time":"0.04269003332974949","paddle_gpu_time_backward":"--"},{"name":"arange_0","op":"arange","op_count":0,"config":"dtype (string): None\nend (int): 65536\nstart (int): 0\nstep (int): 2\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/arange_0-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/arange_0-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/arange_0-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/arange_0-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/arange_0-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/arange_0-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"--","paddle_perf":"0.05341379009947485","paddle_perf_backwards":"--","paddle_gpu_time":"0.0013192679206914084","paddle_gpu_time_backward":"--"},{"name":"argmax_0","op":"argmax","op_count":4,"config":"x (Variable) - dtype: float32, shape: [16, 513, 513, 19]\naxis (int): 3\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/argmax_0-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/argmax_0-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/argmax_0-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/argmax_0-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/argmax_0-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/argmax_0-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"--","paddle_perf":"5.978440877163049","paddle_perf_backwards":"--","paddle_gpu_time":"5.975735959007335","paddle_gpu_time_backward":"--"},{"name":"argmax_1","op":"argmax","op_count":4,"config":"x (Variable) - dtype: float32, shape: [16, 513, 513, 19]\naxis (int): 1\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/argmax_1-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/argmax_1-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/argmax_1-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/argmax_1-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/argmax_1-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/argmax_1-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"--","paddle_perf":"3.04252150082829","paddle_perf_backwards":"--","paddle_gpu_time":"3.0681368591037543","paddle_gpu_time_backward":"--"},{"name":"argmax_2","op":"argmax","op_count":4,"config":"x (Variable) - dtype: float32, shape: [1000, 1000]\naxis (int): -1\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/argmax_2-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/argmax_2-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/argmax_2-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/argmax_2-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/argmax_2-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/argmax_2-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"--","paddle_perf":"0.0355234049787425","paddle_perf_backwards":"--","paddle_gpu_time":"0.02124123208295212","paddle_gpu_time_backward":"--"},{"name":"argmax_3","op":"argmax","op_count":4,"config":"x (Variable) - dtype: float32, shape: [1000000]\naxis (int): 0\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/argmax_3-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/argmax_3-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/argmax_3-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/argmax_3-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/argmax_3-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/argmax_3-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"--","paddle_perf":"0.30277931328975793","paddle_perf_backwards":"--","paddle_gpu_time":"0.2796205446535843","paddle_gpu_time_backward":"--"},{"name":"argmin_0","op":"argmin","op_count":0,"config":"x (Variable) - dtype: float32, shape: [16, 513, 513, 19]\naxis (int): 3\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/argmin_0-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/argmin_0-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/argmin_0-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/argmin_0-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/argmin_0-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/argmin_0-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"--","paddle_perf":"5.970599795832779","paddle_perf_backwards":"--","paddle_gpu_time":"5.9837235004521245","paddle_gpu_time_backward":"--"},{"name":"argmin_1","op":"argmin","op_count":0,"config":"x (Variable) - dtype: float32, shape: [16, 513, 513, 19]\naxis (int): 1\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/argmin_1-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/argmin_1-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/argmin_1-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/argmin_1-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/argmin_1-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/argmin_1-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"--","paddle_perf":"3.0515416704042995","paddle_perf_backwards":"--","paddle_gpu_time":"3.0645906934490768","paddle_gpu_time_backward":"--"},{"name":"argmin_2","op":"argmin","op_count":0,"config":"x (Variable) - dtype: float32, shape: [1000, 1000]\naxis (int): -1\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/argmin_2-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/argmin_2-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/argmin_2-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/argmin_2-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/argmin_2-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/argmin_2-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"--","paddle_perf":"0.03490568411470664","paddle_perf_backwards":"--","paddle_gpu_time":"0.021399898373983737","paddle_gpu_time_backward":"--"},{"name":"argmin_3","op":"argmin","op_count":0,"config":"x (Variable) - dtype: float32, shape: [1000000]\naxis (int): 0\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/argmin_3-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/argmin_3-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/argmin_3-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/argmin_3-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/argmin_3-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/argmin_3-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"--","paddle_perf":"0.29497230895841964","paddle_perf_backwards":"--","paddle_gpu_time":"0.2901381657989588","paddle_gpu_time_backward":"--"},{"name":"argsort_0","op":"argsort","op_count":0,"config":"x (Variable) - dtype: float32, shape: [16, 1785]\naxis (int): 1\ndescending (bool): False\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/argsort_0-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/argsort_0-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/argsort_0-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/argsort_0-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/argsort_0-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/argsort_0-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"--","paddle_perf":"0.07801299192467515","paddle_perf_backwards":"--","paddle_gpu_time":"0.04899283219996324","paddle_gpu_time_backward":"--"},{"name":"argsort_1","op":"argsort","op_count":0,"config":"x (Variable) - dtype: float32, shape: [1700971, 1]\naxis (int): 0\ndescending (bool): True\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/argsort_1-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/argsort_1-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/argsort_1-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/argsort_1-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/argsort_1-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/argsort_1-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"--","paddle_perf":"1.512936426668751","paddle_perf_backwards":"--","paddle_gpu_time":"0.8877671333824613","paddle_gpu_time_backward":"--"},{"name":"assign_0","op":"assign","op_count":75,"config":"input (Variable) - dtype: float32, shape: [2, 768]\noutput (Variable) - dtype: float32, shape: [2, 768]\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/assign_0-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/assign_0-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/assign_0-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/assign_0-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/assign_0-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/assign_0-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"--","paddle_perf":"0.02295970916748047","paddle_perf_backwards":"--","paddle_gpu_time":"0.0013110930019354181","paddle_gpu_time_backward":"--"},{"name":"assign_1","op":"assign","op_count":75,"config":"input (Variable) - dtype: float32, shape: [30522, 1024]\noutput (Variable) - dtype: float32, shape: [30522, 1024]\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/assign_1-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/assign_1-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/assign_1-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/assign_1-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/assign_1-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/assign_1-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"--","paddle_perf":"0.3451830412010591","paddle_perf_backwards":"--","paddle_gpu_time":"0.32475108993206936","paddle_gpu_time_backward":"--"},{"name":"assign_2","op":"assign","op_count":75,"config":"input (Variable) - dtype: float32, shape: [1]\noutput (Variable) - dtype: float32, shape: [1]\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/assign_2-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/assign_2-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/assign_2-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/assign_2-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/assign_2-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/assign_2-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"--","paddle_perf":"0.024633991475007968","paddle_perf_backwards":"--","paddle_gpu_time":"0.0012902208201892745","paddle_gpu_time_backward":"--"},{"name":"avg_pool2d_0","op":"avg_pool2d","op_count":0,"config":"x (Variable) - dtype: float32, shape: [16, 2048, 7, 7]\nceil_mode (bool): False\ndata_format (string): NCHW\nkernel_size (list): [7, 7]\npadding (list): [0, 0]\nstride (list): [1, 1]\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/avg_pool2d_0-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/avg_pool2d_0-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/avg_pool2d_0-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/avg_pool2d_0-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/avg_pool2d_0-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/avg_pool2d_0-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.0295015702764672","paddle_perf_backwards":"0.1579984603636715","paddle_gpu_time":"0.008882733148661125","paddle_gpu_time_backward":"0.13906863314365345"},{"name":"avg_pool2d_1","op":"avg_pool2d","op_count":0,"config":"x (Variable) - dtype: float32, shape: [16, 256, 16, 16]\nceil_mode (bool): False\ndata_format (string): NCHW\nkernel_size (list): [2, 2]\npadding (list): [0, 0]\nstride (list): [2, 2]\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/avg_pool2d_1-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/avg_pool2d_1-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/avg_pool2d_1-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/avg_pool2d_1-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/avg_pool2d_1-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/avg_pool2d_1-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.031752744036351505","paddle_perf_backwards":"0.057186034017192106","paddle_gpu_time":"0.005965454361260257","paddle_gpu_time_backward":"0.030319879417183046"},{"name":"avg_pool2d_2","op":"avg_pool2d","op_count":0,"config":"x (Variable) - dtype: float32, shape: [16, 1024, 16, 16]\nceil_mode (bool): True\ndata_format (string): NCHW\nkernel_size (list): [2, 2]\npadding (list): [0, 0]\nstride (list): [2, 2]\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/avg_pool2d_2-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/avg_pool2d_2-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/avg_pool2d_2-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/avg_pool2d_2-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/avg_pool2d_2-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/avg_pool2d_2-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.054760781939856285","paddle_perf_backwards":"0.10909902786682985","paddle_gpu_time":"0.029859567589412002","paddle_gpu_time_backward":"0.08597355003186744"},{"name":"batch_norm_0","op":"batch_norm","op_count":85,"config":"x (Variable) - dtype: float32, shape: [16, 256]\ndata_format (string): NCHW\nepsilon (float): 1e-05\nmomentum (float): 0.9\ntraining (bool): True\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/batch_norm_0-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/batch_norm_0-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/batch_norm_0-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/batch_norm_0-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/batch_norm_0-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/batch_norm_0-tensorflow_gpu_speed_forward.txt","paddle_consistency":"--","paddle_consistency_backwards":"--","paddle_perf":"0.04047812247762875","paddle_perf_backwards":"0.05833007851425483","paddle_gpu_time":"0.004966265441875198","paddle_gpu_time_backward":"0.009052583447645176"},{"name":"batch_norm_1","op":"batch_norm","op_count":85,"config":"x (Variable) - dtype: float32, shape: [16, 32768]\ndata_format (string): NCHW\nepsilon (float): 1e-05\nmomentum (float): 0.9\ntraining (bool): True\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/batch_norm_1-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/batch_norm_1-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/batch_norm_1-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/batch_norm_1-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/batch_norm_1-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/batch_norm_1-tensorflow_gpu_speed_forward.txt","paddle_consistency":"--","paddle_consistency_backwards":"--","paddle_perf":"0.041234493255615234","paddle_perf_backwards":"0.062144289211351046","paddle_gpu_time":"0.011674594395280236","paddle_gpu_time_backward":"0.02364326765188834"},{"name":"batch_norm_2","op":"batch_norm","op_count":85,"config":"x (Variable) - dtype: float32, shape: [16, 1536, 33, 33]\ndata_format (string): NCHW\nepsilon (float): 0.001\nmomentum (float): 0.99\ntraining (bool): True\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/batch_norm_2-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/batch_norm_2-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/batch_norm_2-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/batch_norm_2-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/batch_norm_2-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/batch_norm_2-tensorflow_gpu_speed_forward.txt","paddle_consistency":"--","paddle_consistency_backwards":"--","paddle_perf":"0.5401310671764205","paddle_perf_backwards":"1.2945437048333717","paddle_gpu_time":"0.5109449929478138","paddle_gpu_time_backward":"1.2523399932272266"},{"name":"batch_norm_3","op":"batch_norm","op_count":85,"config":"x (Variable) - dtype: float32, shape: [16, 256, 1, 1]\ndata_format (string): NCHW\nepsilon (float): 1e-05\nmomentum (float): 0.99\ntraining (bool): True\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/batch_norm_3-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/batch_norm_3-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/batch_norm_3-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/batch_norm_3-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/batch_norm_3-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/batch_norm_3-tensorflow_gpu_speed_forward.txt","paddle_consistency":"--","paddle_consistency_backwards":"--","paddle_perf":"0.04015749238102312","paddle_perf_backwards":"0.06386807644702346","paddle_gpu_time":"0.005085971396432589","paddle_gpu_time_backward":"0.0090054819552307"},{"name":"batch_norm_4","op":"batch_norm","op_count":85,"config":"x (Variable) - dtype: float32, shape: [16, 32, 256, 256]\ndata_format (string): NCHW\nepsilon (float): 1e-05\nmomentum (float): 0.9\ntraining (bool): True\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/batch_norm_4-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/batch_norm_4-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/batch_norm_4-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/batch_norm_4-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/batch_norm_4-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/batch_norm_4-tensorflow_gpu_speed_forward.txt","paddle_consistency":"--","paddle_consistency_backwards":"--","paddle_perf":"0.7429562419293875","paddle_perf_backwards":"2.1714267003009597","paddle_gpu_time":"0.7165747080144986","paddle_gpu_time_backward":"2.1268894911171334"},{"name":"bernoulli_0","op":"bernoulli","op_count":0,"config":"x (Variable) - dtype: float32, shape: [16, 1785]\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/bernoulli_0-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/bernoulli_0-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/bernoulli_0-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/bernoulli_0-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/bernoulli_0-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/bernoulli_0-tensorflow_gpu_speed_forward.txt","paddle_consistency":"--","paddle_consistency_backwards":"--","paddle_perf":"0.03951682623131687","paddle_perf_backwards":"--","paddle_gpu_time":"0.012009806864805362","paddle_gpu_time_backward":"--"},{"name":"bernoulli_1","op":"bernoulli","op_count":0,"config":"x (Variable) - dtype: float32, shape: [16, 1, 513, 513]\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/bernoulli_1-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/bernoulli_1-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/bernoulli_1-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/bernoulli_1-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/bernoulli_1-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/bernoulli_1-tensorflow_gpu_speed_forward.txt","paddle_consistency":"--","paddle_consistency_backwards":"--","paddle_perf":"0.7047428184723759","paddle_perf_backwards":"--","paddle_gpu_time":"0.681329196276649","paddle_gpu_time_backward":"--"},{"name":"bernoulli_2","op":"bernoulli","op_count":0,"config":"x (Variable) - dtype: float32, shape: [30522, 1024]\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/bernoulli_2-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/bernoulli_2-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/bernoulli_2-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/bernoulli_2-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/bernoulli_2-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/bernoulli_2-tensorflow_gpu_speed_forward.txt","paddle_consistency":"--","paddle_consistency_backwards":"--","paddle_perf":"4.935687421316124","paddle_perf_backwards":"--","paddle_gpu_time":"4.891302171954759","paddle_gpu_time_backward":"--"},{"name":"binary_cross_entropy_with_logits_0","op":"binary_cross_entropy_with_logits","op_count":0,"config":"label (Variable) - dtype: float32, shape: [16, 3, 64, 64, 80]\nlogit (Variable) - dtype: float32, shape: [16, 3, 64, 64, 80]\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/binary_cross_entropy_with_logits_0-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/binary_cross_entropy_with_logits_0-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/binary_cross_entropy_with_logits_0-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/binary_cross_entropy_with_logits_0-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/binary_cross_entropy_with_logits_0-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/binary_cross_entropy_with_logits_0-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.3616172440198003","paddle_perf_backwards":"0.7916302097087003","paddle_gpu_time":"0.36139619414154367","paddle_gpu_time_backward":"0.7699570013086557"},{"name":"binary_cross_entropy_with_logits_1","op":"binary_cross_entropy_with_logits","op_count":0,"config":"label (Variable) - dtype: float32, shape: [16, 900]\nlogit (Variable) - dtype: float32, shape: [16, 900]\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/binary_cross_entropy_with_logits_1-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/binary_cross_entropy_with_logits_1-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/binary_cross_entropy_with_logits_1-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/binary_cross_entropy_with_logits_1-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/binary_cross_entropy_with_logits_1-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/binary_cross_entropy_with_logits_1-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.03756382027450873","paddle_perf_backwards":"0.04715554568232322","paddle_gpu_time":"0.002957526780680323","paddle_gpu_time_backward":"0.004496260683760684"},{"name":"case_0","op":"case","op_count":0,"config":"input (Variable) - dtype: float32, shape: [1]\nx (Variable) - dtype: float32, shape: [16, 256, 6, 6]\ny (Variable) - dtype: float32, shape: [16, 256, 6, 6]\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/case_0-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/case_0-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/case_0-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/case_0-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/case_0-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/case_0-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.26228841470212355","paddle_perf_backwards":"0.46846355710710796","paddle_gpu_time":"0.019793856402664695","paddle_gpu_time_backward":"0.04631597591215019"},{"name":"cast_0","op":"cast","op_count":154,"config":"x (Variable) - dtype: bool, shape: [16, 1785]\ndtype (string): bool\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/cast_0-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/cast_0-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/cast_0-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/cast_0-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/cast_0-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/cast_0-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"--","paddle_perf":"0.019454469486158723","paddle_perf_backwards":"--","paddle_gpu_time":"0.0012923970648185893","paddle_gpu_time_backward":"--"},{"name":"cast_1","op":"cast","op_count":154,"config":"x (Variable) - dtype: int32, shape: [16, 1]\ndtype (string): int64\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/cast_1-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/cast_1-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/cast_1-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/cast_1-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/cast_1-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/cast_1-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"--","paddle_perf":"0.01880149452053771","paddle_perf_backwards":"--","paddle_gpu_time":"0.001318357783211084","paddle_gpu_time_backward":"--"},{"name":"cast_2","op":"cast","op_count":154,"config":"x (Variable) - dtype: int32, shape: [16, 1, 513, 513]\ndtype (string): float32\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/cast_2-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/cast_2-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/cast_2-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/cast_2-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/cast_2-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/cast_2-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"--","paddle_perf":"0.058897192698405924","paddle_perf_backwards":"--","paddle_gpu_time":"0.04410708853057499","paddle_gpu_time_backward":"--"},{"name":"cast_3","op":"cast","op_count":154,"config":"x (Variable) - dtype: float16, shape: [30522, 1024]\ndtype (string): float32\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/cast_3-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/cast_3-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/cast_3-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/cast_3-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/cast_3-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/cast_3-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"--","paddle_perf":"0.2635867241396004","paddle_perf_backwards":"--","paddle_gpu_time":"0.24973197127541216","paddle_gpu_time_backward":"--"},{"name":"cast_4","op":"cast","op_count":154,"config":"x (Variable) - dtype: int64, shape: [1]\ndtype (string): float32\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/cast_4-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/cast_4-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/cast_4-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/cast_4-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/cast_4-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/cast_4-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"--","paddle_perf":"0.01859932529683016","paddle_perf_backwards":"--","paddle_gpu_time":"0.001314701690083486","paddle_gpu_time_backward":"--"},{"name":"cast_5","op":"cast","op_count":154,"config":"x (Variable) - dtype: float32, shape: [16, 16, 1]\ndtype (string): float16\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/cast_5-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/cast_5-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/cast_5-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/cast_5-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/cast_5-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/cast_5-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"--","paddle_perf":"0.018823390104332753","paddle_perf_backwards":"--","paddle_gpu_time":"0.0013060393115388532","paddle_gpu_time_backward":"--"},{"name":"cast_6","op":"cast","op_count":154,"config":"x (Variable) - dtype: float32, shape: [16, 16, 1024]\ndtype (string): float16\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/cast_6-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/cast_6-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/cast_6-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/cast_6-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/cast_6-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/cast_6-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"--","paddle_perf":"0.01965110081745439","paddle_perf_backwards":"--","paddle_gpu_time":"0.001813003159074697","paddle_gpu_time_backward":"--"},{"name":"cholesky_0","op":"cholesky","op_count":0,"config":"x (Variable) - dtype: float32, shape: [200, 200]\nupper (bool): False\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/cholesky_0-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/cholesky_0-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/cholesky_0-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/cholesky_0-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/cholesky_0-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/cholesky_0-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.21754843848092215","paddle_perf_backwards":"0.4098045582673988","paddle_gpu_time":"0.12430577531645567","paddle_gpu_time_backward":"0.31029067706487057"},{"name":"cholesky_1","op":"cholesky","op_count":0,"config":"x (Variable) - dtype: float32, shape: [16, 10, 20]\nupper (bool): False\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/cholesky_1-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/cholesky_1-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/cholesky_1-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/cholesky_1-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/cholesky_1-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/cholesky_1-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.1542894207701391","paddle_perf_backwards":"0.3933154806798818","paddle_gpu_time":"0.02254530386740331","paddle_gpu_time_backward":"0.14846453210684593"},{"name":"clip_by_norm_0","op":"clip_by_norm","op_count":0,"config":"","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/clip_by_norm_0-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/clip_by_norm_0-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/clip_by_norm_0-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/clip_by_norm_0-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/clip_by_norm_0-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/clip_by_norm_0-tensorflow_gpu_speed_forward.txt","tensorflow_consistency":"--","tensorflow_consistency_backwards":"--","tensorflow_perf":"0.07725905398933254","tensorflow_perf_backwards":"--","tensorflow_gpu_time":"0.016432170542635658","tensorflow_gpu_time_backward":"--"},{"name":"concat_0","op":"concat","op_count":207,"config":"x (list<Variable>[2]) - dtype: float32, shape: [16, 256, 129, 129]; dtype: float32, shape: [16, 48, 129, 129]; \naxis (int): 1\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/concat_0-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/concat_0-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/concat_0-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/concat_0-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/concat_0-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/concat_0-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.948416947360977","paddle_perf_backwards":"1.8728502304199708","paddle_gpu_time":"0.9265941660737879","paddle_gpu_time_backward":"1.8538063740228503"},{"name":"concat_1","op":"concat","op_count":207,"config":"x (list<Variable>[5]) - dtype: float32, shape: [16, 16, 1]; \naxis (int): 1\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/concat_1-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/concat_1-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/concat_1-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/concat_1-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/concat_1-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/concat_1-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.030198145885856783","paddle_perf_backwards":"0.053738574592434626","paddle_gpu_time":"0.003917914586799778","paddle_gpu_time_backward":"0.007942266264541146"},{"name":"concat_2","op":"concat","op_count":207,"config":"x (list<Variable>[201]) - dtype: float32, shape: [1]; \naxis (int): 0\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/concat_2-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/concat_2-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/concat_2-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/concat_2-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/concat_2-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/concat_2-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.10845588178050761","paddle_perf_backwards":"0.34467979353301376","paddle_gpu_time":"0.004424460431654676","paddle_gpu_time_backward":"0.008462154585502603"},{"name":"concat_3","op":"concat","op_count":207,"config":"x (list<Variable>[2]) - dtype: float32, shape: [1, 16, 200]; \naxis (int): 0\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/concat_3-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/concat_3-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/concat_3-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/concat_3-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/concat_3-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/concat_3-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.029992570682447785","paddle_perf_backwards":"0.054718523609394935","paddle_gpu_time":"0.0025547024952015354","paddle_gpu_time_backward":"0.005137355455002514"},{"name":"concat_4","op":"concat","op_count":207,"config":"x (list<Variable>[4]) - dtype: float32, shape: [16, 256, 14, 14]; \naxis (int): 0\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/concat_4-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/concat_4-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/concat_4-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/concat_4-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/concat_4-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/concat_4-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.06534588863571963","paddle_perf_backwards":"0.10642852170400352","paddle_gpu_time":"0.04283491614467569","paddle_gpu_time_backward":"0.07899617783142224"},{"name":"cond_0","op":"cond","op_count":0,"config":"input (Variable) - dtype: float32, shape: [1]\nx (Variable) - dtype: float32, shape: [16, 256, 6, 6]\ny (Variable) - dtype: float32, shape: [16, 256, 6, 6]\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/cond_0-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/cond_0-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/cond_0-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/cond_0-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/cond_0-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/cond_0-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.2421449641792142","paddle_perf_backwards":"0.40900707244873047","paddle_gpu_time":"0.015987691383968775","paddle_gpu_time_backward":"0.038438516590761224"},{"name":"conv2d_0","op":"conv2d","op_count":135,"config":"weight (Variable) - dtype: float32, shape: [512, 512, 3, 3]\nx (Variable) - dtype: float32, shape: [16, 512, 7, 7]\ndata_format (string): NCHW\ndilation (int): 1\ngroups (int): 1\npadding (int): 1\nstride (int): 1\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/conv2d_0-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/conv2d_0-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/conv2d_0-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/conv2d_0-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/conv2d_0-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/conv2d_0-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.310855009117905","paddle_perf_backwards":"0.7684639522007534","paddle_gpu_time":"0.2683156654888104","paddle_gpu_time_backward":"0.7073700543056634"},{"name":"conv2d_1","op":"conv2d","op_count":135,"config":"weight (Variable) - dtype: float32, shape: [5, 2048, 2, 2]\nx (Variable) - dtype: float32, shape: [16, 2048, 2, 2]\ndata_format (string): NCHW\ndilation (int): 1\ngroups (string): None\npadding (int): 0\nstride (int): 1\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/conv2d_1-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/conv2d_1-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/conv2d_1-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/conv2d_1-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/conv2d_1-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/conv2d_1-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.47340076796862546","paddle_perf_backwards":"0.5542210170200893","paddle_gpu_time":"0.4259777933380014","paddle_gpu_time_backward":"0.4552249946683728"},{"name":"conv2d_2","op":"conv2d","op_count":135,"config":"weight (Variable) - dtype: float32, shape: [1024, 512, 1, 1]\nx (Variable) - dtype: float32, shape: [16, 512, 64, 402]\ndata_format (string): NCHW\ndilation (int): 1\ngroups (string): None\npadding (int): 0\nstride (tuple): [2, 1]\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/conv2d_2-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/conv2d_2-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/conv2d_2-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/conv2d_2-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/conv2d_2-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/conv2d_2-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"16.446034032471324","paddle_perf_backwards":"53.39936504558641","paddle_gpu_time":"16.638510445049956","paddle_gpu_time_backward":"53.42495921696574"},{"name":"conv2d_3","op":"conv2d","op_count":135,"config":"weight (Variable) - dtype: float32, shape: [128, 128, 1, 1]\nx (Variable) - dtype: float32, shape: [16, 128, 257, 257]\ndata_format (string): NCHW\ndilation (int): 1\ngroups (int): 1\npadding (int): 0\nstride (int): 1\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/conv2d_3-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/conv2d_3-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/conv2d_3-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/conv2d_3-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/conv2d_3-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/conv2d_3-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"2.8805257804901245","paddle_perf_backwards":"9.568641942188922","paddle_gpu_time":"2.849249370277078","paddle_gpu_time_backward":"9.46820342205323"},{"name":"conv2d_4","op":"conv2d","op_count":135,"config":"weight (Variable) - dtype: float32, shape: [256, 2048, 1, 1]\nx (Variable) - dtype: float32, shape: [16, 2048, 1, 1]\ndata_format (string): NCHW\ndilation (int): 1\ngroups (int): 1\npadding (int): 0\nstride (int): 1\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/conv2d_4-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/conv2d_4-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/conv2d_4-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/conv2d_4-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/conv2d_4-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/conv2d_4-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.1551832471575056","paddle_perf_backwards":"0.23110594068254742","paddle_gpu_time":"0.10960650471792811","paddle_gpu_time_backward":"0.18311956361491963"},{"name":"conv2d_5","op":"conv2d","op_count":135,"config":"weight (Variable) - dtype: float32, shape: [64, 3, 4, 4]\nx (Variable) - dtype: float32, shape: [16, 3, 128, 128]\ndata_format (string): NCHW\ndilation (int): 1\ngroups (string): None\npadding (list): [1, 1]\nstride (int): 2\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/conv2d_5-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/conv2d_5-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/conv2d_5-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/conv2d_5-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/conv2d_5-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/conv2d_5-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.1062653502639459","paddle_perf_backwards":"0.3358765524260852","paddle_gpu_time":"0.06659271957245134","paddle_gpu_time_backward":"0.2899744572158365"},{"name":"conv2d_6","op":"conv2d","op_count":135,"config":"weight (Variable) - dtype: float32, shape: [32, 3, 3, 3]\nx (Variable) - dtype: float32, shape: [16, 3, 513, 513]\ndata_format (string): NCHW\ndilation (int): 1\ngroups (string): None\npadding (int): 1\nstride (int): 2\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/conv2d_6-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/conv2d_6-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/conv2d_6-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/conv2d_6-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/conv2d_6-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/conv2d_6-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.41322154228133384","paddle_perf_backwards":"2.01024106054595","paddle_gpu_time":"0.36629671112921297","paddle_gpu_time_backward":"1.9432972614840986"},{"name":"conv2d_7","op":"conv2d","op_count":135,"config":"weight (Variable) - dtype: float32, shape: [32, 1, 7, 1]\nx (Variable) - dtype: float32, shape: [16, 1, 512, 402]\ndata_format (string): NCHW\ndilation (int): 1\ngroups (string): None\npadding (tuple): [3, 0]\nstride (tuple): [2, 1]\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/conv2d_7-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/conv2d_7-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/conv2d_7-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/conv2d_7-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/conv2d_7-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/conv2d_7-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.39066295234524473","paddle_perf_backwards":"1.8467893405836457","paddle_gpu_time":"0.33348572589657627","paddle_gpu_time_backward":"1.7525220372184134"},{"name":"conv2d_8","op":"conv2d","op_count":135,"config":"weight (Variable) - dtype: float32, shape: [256, 8, 3, 3]\nx (Variable) - dtype: float32, shape: [16, 256, 56, 56]\ndata_format (string): NCHW\ndilation (int): 1\ngroups (int): 32\npadding (int): 1\nstride (int): 2\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/conv2d_8-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/conv2d_8-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/conv2d_8-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/conv2d_8-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/conv2d_8-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/conv2d_8-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.37124473221448007","paddle_perf_backwards":"1.9316305919569365","paddle_gpu_time":"0.4329995913363302","paddle_gpu_time_backward":"6.0645422357106735"},{"name":"conv2d_9","op":"conv2d","op_count":135,"config":"weight (Variable) - dtype: float32, shape: [512, 1024, 3, 3]\nx (Variable) - dtype: float32, shape: [16, 1024, 8, 8]\ndata_format (string): NCHW\ndilation (int): 1\ngroups (string): None\npadding (list): [1, 1]\nstride (int): 1\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/conv2d_9-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/conv2d_9-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/conv2d_9-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/conv2d_9-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/conv2d_9-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/conv2d_9-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.5592431340898787","paddle_perf_backwards":"1.4172532120529486","paddle_gpu_time":"0.5389499566097773","paddle_gpu_time_backward":"1.3585378323108386"},{"name":"conv2d_10","op":"conv2d","op_count":135,"config":"weight (Variable) - dtype: float16, shape: [1, 1, 3, 32]\nx (Variable) - dtype: float16, shape: [1, 1, 80, 1008]\ndata_format (string): NCHW\ndilation (tuple): [1, 1]\ngroups (int): 1\npadding (tuple): [1, 8]\nstride (tuple): [1, 16]\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/conv2d_10-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/conv2d_10-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/conv2d_10-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/conv2d_10-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/conv2d_10-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/conv2d_10-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.06094124852394572","paddle_perf_backwards":"0.8485815962966607","paddle_gpu_time":"0.009655744295909138","paddle_gpu_time_backward":"0.6963649238758409"},{"name":"conv2d_11","op":"conv2d","op_count":135,"config":"weight (Variable) - dtype: float32, shape: [512, 512, 3, 3]\nx (Variable) - dtype: float32, shape: [2, 512, 129, 129]\ndata_format (string): NCHW\ndilation (tuple): [16, 16]\ngroups (int): 1\npadding (tuple): [0, 0]\nstride (tuple): [1, 1]\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/conv2d_11-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/conv2d_11-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/conv2d_11-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/conv2d_11-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/conv2d_11-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/conv2d_11-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"6.61750180380685","paddle_perf_backwards":"168.68680204663957","paddle_gpu_time":"6.587961106655974","paddle_gpu_time_backward":"169.36035741527735"},{"name":"conv2d_transpose_0","op":"conv2d_transpose","op_count":7,"config":"weight (Variable) - dtype: float32, shape: [1549, 512, 4, 4]\nx (Variable) - dtype: float32, shape: [16, 1549, 8, 8]\ndata_format (string): NCHW\ndilation (int): 1\ngroups (string): None\noutput_size (list): [16, 16]\npadding (list): [1, 1]\nstride (int): 2\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/conv2d_transpose_0-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/conv2d_transpose_0-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/conv2d_transpose_0-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/conv2d_transpose_0-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/conv2d_transpose_0-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/conv2d_transpose_0-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"3.3510884460137818","paddle_perf_backwards":"8.04602321313352","paddle_gpu_time":"3.2952342569269524","paddle_gpu_time_backward":"8.044063126252505"},{"name":"conv2d_transpose_1","op":"conv2d_transpose","op_count":7,"config":"weight (Variable) - dtype: float32, shape: [128, 64, 3, 3]\nx (Variable) - dtype: float32, shape: [16, 128, 64, 64]\ndata_format (string): NCHW\ndilation (int): 1\ngroups (string): None\noutput_size (list): [127, 127]\npadding (list): [1, 1]\nstride (int): 2\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/conv2d_transpose_1-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/conv2d_transpose_1-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/conv2d_transpose_1-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/conv2d_transpose_1-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/conv2d_transpose_1-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/conv2d_transpose_1-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"1.5625759046904895","paddle_perf_backwards":"3.229873764271639","paddle_gpu_time":"1.5274933190807056","paddle_gpu_time_backward":"3.2251096491228073"},{"name":"conv2d_transpose_2","op":"conv2d_transpose","op_count":7,"config":"weight (Variable) - dtype: float32, shape: [512, 512, 4, 4]\nx (Variable) - dtype: float32, shape: [16, 512, 1, 1]\ndata_format (string): NCHW\ndilation (int): 1\ngroups (string): None\noutput_size (list): [2, 2]\npadding (int): 1\nstride (int): 2\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/conv2d_transpose_2-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/conv2d_transpose_2-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/conv2d_transpose_2-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/conv2d_transpose_2-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/conv2d_transpose_2-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/conv2d_transpose_2-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"1.0706354160698093","paddle_perf_backwards":"2.0013972204558708","paddle_gpu_time":"1.0121723410665864","paddle_gpu_time_backward":"1.9265515903801396"},{"name":"conv2d_transpose_3","op":"conv2d_transpose","op_count":7,"config":"weight (Variable) - dtype: float32, shape: [256, 256, 2, 2]\nx (Variable) - dtype: float32, shape: [32, 256, 14, 14]\ndata_format (string): NCHW\ndilation (int): 1\ngroups (string): None\noutput_size (list): [28, 28]\npadding (int): 0\nstride (int): 2\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/conv2d_transpose_3-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/conv2d_transpose_3-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/conv2d_transpose_3-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/conv2d_transpose_3-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/conv2d_transpose_3-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/conv2d_transpose_3-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.6111553737095424","paddle_perf_backwards":"1.2583662052543796","paddle_gpu_time":"0.5752676659528908","paddle_gpu_time_backward":"1.2029733959311424"},{"name":"conv2d_transpose_4","op":"conv2d_transpose","op_count":7,"config":"weight (Variable) - dtype: float32, shape: [1, 1, 3, 32]\nx (Variable) - dtype: float32, shape: [1, 1, 80, 63]\ndata_format (string): NCHW\ndilation (list): [1, 1]\ngroups (int): 1\noutput_size (string): None\npadding (list): [1, 8]\nstride (list): [1, 16]\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/conv2d_transpose_4-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/conv2d_transpose_4-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/conv2d_transpose_4-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/conv2d_transpose_4-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/conv2d_transpose_4-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/conv2d_transpose_4-tensorflow_gpu_speed_forward.txt","paddle_consistency":"--","paddle_consistency_backwards":"--","paddle_perf":"0.05015864664194536","paddle_perf_backwards":"0.6130943492967255","paddle_gpu_time":"0.013404064219153475","paddle_gpu_time_backward":"0.5153545868575147"},{"name":"conv3d_0","op":"conv3d","op_count":2,"config":"weight (Variable) - dtype: float32, shape: [128, 64, 3, 3, 3]\nx (Variable) - dtype: float32, shape: [1, 64, 5, 360, 360]\ndata_format (string): NCDHW\ndilation (int): 1\ngroups (int): 1\npadding (int): 0\nstride (list): [1, 2, 2]\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/conv3d_0-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/conv3d_0-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/conv3d_0-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/conv3d_0-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/conv3d_0-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/conv3d_0-tensorflow_gpu_speed_forward.txt","paddle_consistency":"--","paddle_consistency_backwards":"--","paddle_perf":"3.5029351711273193","paddle_perf_backwards":"68.78609657287598","paddle_gpu_time":"3.700387331966279","paddle_gpu_time_backward":"70.30565232048545"},{"name":"conv3d_transpose_0","op":"conv3d_transpose","op_count":0,"config":"input (Variable) - dtype: float32, shape: [16, 3, 8, 8, 8]\nact (string): None\ndata_format (string): NCDHW\ndilation (int): 1\nfilter_size (int): 3\ngroups (string): None\nnum_filters (int): 6\noutput_size (list): [10, 10, 10]\npadding (int): 0\nstride (int): 1\nuse_cudnn (bool): True\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/conv3d_transpose_0-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/conv3d_transpose_0-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/conv3d_transpose_0-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/conv3d_transpose_0-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/conv3d_transpose_0-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/conv3d_transpose_0-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.08663644596022002","paddle_perf_backwards":"0.24809472414912007","paddle_gpu_time":"0.04269474589019018","paddle_gpu_time_backward":"0.2125585754451734"},{"name":"cos_0","op":"cos","op_count":3,"config":"x (Variable) - dtype: float32, shape: [16, 128, 257, 257]\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/cos_0-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/cos_0-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/cos_0-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/cos_0-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/cos_0-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/cos_0-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"1.3334699288637701","paddle_perf_backwards":"3.2444036078596405","paddle_gpu_time":"1.3231815434213179","paddle_gpu_time_backward":"3.229014227642276"},{"name":"cos_1","op":"cos","op_count":3,"config":"x (Variable) - dtype: float16, shape: [16, 128, 257, 257]\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/cos_1-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/cos_1-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/cos_1-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/cos_1-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/cos_1-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/cos_1-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.6916530624420226","paddle_perf_backwards":"1.6653602252264539","paddle_gpu_time":"0.6782866639806607","paddle_gpu_time_backward":"1.6492914188615122"},{"name":"cosine_similarity_0","op":"cosine_similarity","op_count":0,"config":"x1 (Variable) - dtype: float32, shape: [16, 256]\nx2 (Variable) - dtype: float32, shape: [16, 256]\naxis (int): 1\neps (float): 1e-08\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/cosine_similarity_0-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/cosine_similarity_0-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/cosine_similarity_0-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/cosine_similarity_0-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/cosine_similarity_0-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/cosine_similarity_0-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.12081958809677434","paddle_perf_backwards":"0.27974089797662227","paddle_gpu_time":"0.01432777149321267","paddle_gpu_time_backward":"0.04033333333333334"},{"name":"cumsum_0","op":"cumsum","op_count":0,"config":"x (Variable) - dtype: float32, shape: [1700971, 1]\naxis (int): 0\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/cumsum_0-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/cumsum_0-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/cumsum_0-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/cumsum_0-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/cumsum_0-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/cumsum_0-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"--","paddle_perf":"0.0724313210467903","paddle_perf_backwards":"--","paddle_gpu_time":"0.02262709966405375","paddle_gpu_time_backward":"--"},{"name":"cumsum_1","op":"cumsum","op_count":0,"config":"x (Variable) - dtype: float32, shape: [1700971, 100]\naxis (int): 0\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/cumsum_1-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/cumsum_1-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/cumsum_1-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/cumsum_1-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/cumsum_1-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/cumsum_1-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"--","paddle_perf":"10.890969634056091","paddle_perf_backwards":"--","paddle_gpu_time":"12.299464316571624","paddle_gpu_time_backward":"--"},{"name":"data_norm_0","op":"data_norm","op_count":0,"config":"x (Variable) - dtype: float32, shape: [100, 1785]\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/data_norm_0-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/data_norm_0-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/data_norm_0-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/data_norm_0-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/data_norm_0-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/data_norm_0-tensorflow_gpu_speed_forward.txt","paddle_consistency":"--","paddle_consistency_backwards":"--","paddle_perf":"0.029009215685786034","paddle_perf_backwards":"0.05558923799164441","paddle_gpu_time":"0.004954785229841749","paddle_gpu_time_backward":"0.014293701657458562"},{"name":"depthwise_conv2d_0","op":"depthwise_conv2d","op_count":0,"config":"weight (Variable) - dtype: float32, shape: [2048, 1, 3, 3]\nx (Variable) - dtype: float32, shape: [16, 2048, 33, 33]\ndata_format (string): NCHW\ndilation (int): 18\ngroups (long): 2048\npadding (int): 18\nstride (int): 1\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/depthwise_conv2d_0-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/depthwise_conv2d_0-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/depthwise_conv2d_0-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/depthwise_conv2d_0-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/depthwise_conv2d_0-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/depthwise_conv2d_0-tensorflow_gpu_speed_forward.txt","paddle_consistency":"--","paddle_consistency_backwards":"--","paddle_perf":"0.43426046566087373","paddle_perf_backwards":"3.7328143509066836","paddle_gpu_time":"0.4443133503401361","paddle_gpu_time_backward":"3.7436203818156844"},{"name":"depthwise_conv2d_1","op":"depthwise_conv2d","op_count":0,"config":"weight (Variable) - dtype: float32, shape: [2048, 1, 3, 3]\nx (Variable) - dtype: float32, shape: [16, 33, 33, 2048]\ndata_format (string): NHWC\ndilation (int): 18\ngroups (long): 2048\npadding (int): 18\nstride (int): 1\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/depthwise_conv2d_1-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/depthwise_conv2d_1-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/depthwise_conv2d_1-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/depthwise_conv2d_1-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/depthwise_conv2d_1-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/depthwise_conv2d_1-tensorflow_gpu_speed_forward.txt","paddle_consistency":"--","paddle_consistency_backwards":"--","paddle_perf":"0.5845999231143874","paddle_perf_backwards":"1.9711740162907814","paddle_gpu_time":"0.5683715523618303","paddle_gpu_time_backward":"1.9434684684684687"},{"name":"depthwise_conv2d_2","op":"depthwise_conv2d","op_count":0,"config":"weight (Variable) - dtype: float32, shape: [2048, 1, 3, 3]\nx (Variable) - dtype: float32, shape: [4, 2048, 64, 128]\ndata_format (string): NCHW\ndilation (int): 12\ngroups (long): 2048\npadding (int): 12\nstride (int): 1\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/depthwise_conv2d_2-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/depthwise_conv2d_2-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/depthwise_conv2d_2-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/depthwise_conv2d_2-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/depthwise_conv2d_2-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/depthwise_conv2d_2-tensorflow_gpu_speed_forward.txt","paddle_consistency":"--","paddle_consistency_backwards":"--","paddle_perf":"0.7685410733125648","paddle_perf_backwards":"5.95715654139616","paddle_gpu_time":"0.7922445060806486","paddle_gpu_time_backward":"5.936238797340272"},{"name":"depthwise_conv2d_3","op":"depthwise_conv2d","op_count":0,"config":"weight (Variable) - dtype: float32, shape: [2048, 1, 3, 3]\nx (Variable) - dtype: float32, shape: [4, 64, 128, 2048]\ndata_format (string): NHWC\ndilation (int): 12\ngroups (long): 2048\npadding (int): 12\nstride (int): 1\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/depthwise_conv2d_3-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/depthwise_conv2d_3-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/depthwise_conv2d_3-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/depthwise_conv2d_3-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/depthwise_conv2d_3-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/depthwise_conv2d_3-tensorflow_gpu_speed_forward.txt","paddle_consistency":"--","paddle_consistency_backwards":"--","paddle_perf":"1.5200675750265316","paddle_perf_backwards":"4.812486074408706","paddle_gpu_time":"1.509220078821821","paddle_gpu_time_backward":"4.890282131661442"},{"name":"depthwise_conv2d_4","op":"depthwise_conv2d","op_count":0,"config":"weight (Variable) - dtype: float32, shape: [728, 1, 3, 3]\nx (Variable) - dtype: float32, shape: [8, 728, 65, 65]\ndata_format (string): NCHW\ndilation (int): 1\ngroups (long): 728\npadding (int): 1\nstride (int): 1\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/depthwise_conv2d_4-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/depthwise_conv2d_4-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/depthwise_conv2d_4-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/depthwise_conv2d_4-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/depthwise_conv2d_4-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/depthwise_conv2d_4-tensorflow_gpu_speed_forward.txt","paddle_consistency":"--","paddle_consistency_backwards":"--","paddle_perf":"0.35119835211306205","paddle_perf_backwards":"2.4080191339765276","paddle_gpu_time":"0.33802445709466583","paddle_gpu_time_backward":"2.4072117524855314"},{"name":"depthwise_conv2d_5","op":"depthwise_conv2d","op_count":0,"config":"weight (Variable) - dtype: float32, shape: [728, 1, 3, 3]\nx (Variable) - dtype: float32, shape: [8, 65, 65, 728]\ndata_format (string): NHWC\ndilation (int): 1\ngroups (long): 728\npadding (int): 1\nstride (int): 1\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/depthwise_conv2d_5-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/depthwise_conv2d_5-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/depthwise_conv2d_5-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/depthwise_conv2d_5-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/depthwise_conv2d_5-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/depthwise_conv2d_5-tensorflow_gpu_speed_forward.txt","paddle_consistency":"--","paddle_consistency_backwards":"--","paddle_perf":"0.563528586407097","paddle_perf_backwards":"1.7572308073238452","paddle_gpu_time":"0.5410089332632684","paddle_gpu_time_backward":"1.7080296678490807"},{"name":"depthwise_conv2d_6","op":"depthwise_conv2d","op_count":0,"config":"weight (Variable) - dtype: float32, shape: [128, 1, 3, 3]\nx (Variable) - dtype: float32, shape: [8, 128, 257, 257]\ndata_format (string): NCHW\ndilation (int): 1\ngroups (long): 128\npadding (int): 1\nstride (int): 2\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/depthwise_conv2d_6-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/depthwise_conv2d_6-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/depthwise_conv2d_6-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/depthwise_conv2d_6-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/depthwise_conv2d_6-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/depthwise_conv2d_6-tensorflow_gpu_speed_forward.txt","paddle_consistency":"--","paddle_consistency_backwards":"--","paddle_perf":"0.49675484092868105","paddle_perf_backwards":"3.044777500386141","paddle_gpu_time":"0.5214900662251656","paddle_gpu_time_backward":"3.046329526916803"},{"name":"depthwise_conv2d_7","op":"depthwise_conv2d","op_count":0,"config":"weight (Variable) - dtype: float32, shape: [128, 1, 3, 3]\nx (Variable) - dtype: float32, shape: [8, 257, 257, 128]\ndata_format (string): NHWC\ndilation (int): 1\ngroups (long): 128\npadding (int): 1\nstride (int): 2\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/depthwise_conv2d_7-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/depthwise_conv2d_7-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/depthwise_conv2d_7-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/depthwise_conv2d_7-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/depthwise_conv2d_7-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/depthwise_conv2d_7-tensorflow_gpu_speed_forward.txt","paddle_consistency":"--","paddle_consistency_backwards":"--","paddle_perf":"0.6079625110236966","paddle_perf_backwards":"2.484661948924162","paddle_gpu_time":"0.6178194993412385","paddle_gpu_time_backward":"2.4841522157996145"},{"name":"depthwise_conv2d_8","op":"depthwise_conv2d","op_count":0,"config":"weight (Variable) - dtype: float32, shape: [304, 1, 3, 3]\nx (Variable) - dtype: float32, shape: [4, 304, 128, 256]\ndata_format (string): NCHW\ndilation (int): 1\ngroups (long): 304\npadding (int): 1\nstride (int): 1\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/depthwise_conv2d_8-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/depthwise_conv2d_8-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/depthwise_conv2d_8-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/depthwise_conv2d_8-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/depthwise_conv2d_8-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/depthwise_conv2d_8-tensorflow_gpu_speed_forward.txt","paddle_consistency":"--","paddle_consistency_backwards":"--","paddle_perf":"0.4766225814819336","paddle_perf_backwards":"2.5614524374202805","paddle_gpu_time":"0.485045140732873","paddle_gpu_time_backward":"2.572104144527099"},{"name":"depthwise_conv2d_9","op":"depthwise_conv2d","op_count":0,"config":"weight (Variable) - dtype: float32, shape: [304, 1, 3, 3]\nx (Variable) - dtype: float32, shape: [4, 128, 256, 304]\ndata_format (string): NHWC\ndilation (int): 1\ngroups (long): 304\npadding (int): 1\nstride (int): 1\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/depthwise_conv2d_9-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/depthwise_conv2d_9-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/depthwise_conv2d_9-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/depthwise_conv2d_9-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/depthwise_conv2d_9-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/depthwise_conv2d_9-tensorflow_gpu_speed_forward.txt","paddle_consistency":"--","paddle_consistency_backwards":"--","paddle_perf":"1.140820736787757","paddle_perf_backwards":"3.47914525440761","paddle_gpu_time":"1.1070727929788333","paddle_gpu_time_backward":"3.459722659943271"},{"name":"depthwise_conv2d_10","op":"depthwise_conv2d","op_count":0,"config":"weight (Variable) - dtype: float32, shape: [256, 1, 3, 3]\nx (Variable) - dtype: float32, shape: [4, 256, 128, 256]\ndata_format (string): NCHW\ndilation (int): 1\ngroups (long): 256\npadding (int): 1\nstride (int): 1\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/depthwise_conv2d_10-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/depthwise_conv2d_10-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/depthwise_conv2d_10-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/depthwise_conv2d_10-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/depthwise_conv2d_10-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/depthwise_conv2d_10-tensorflow_gpu_speed_forward.txt","paddle_consistency":"--","paddle_consistency_backwards":"--","paddle_perf":"0.4326187834447744","paddle_perf_backwards":"2.279125914281728","paddle_gpu_time":"0.43586922018833985","paddle_gpu_time_backward":"2.27520572450805"},{"name":"depthwise_conv2d_11","op":"depthwise_conv2d","op_count":0,"config":"weight (Variable) - dtype: float32, shape: [256, 1, 3, 3]\nx (Variable) - dtype: float32, shape: [4, 128, 256, 256]\ndata_format (string): NHWC\ndilation (int): 1\ngroups (long): 256\npadding (int): 1\nstride (int): 1\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/depthwise_conv2d_11-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/depthwise_conv2d_11-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/depthwise_conv2d_11-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/depthwise_conv2d_11-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/depthwise_conv2d_11-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/depthwise_conv2d_11-tensorflow_gpu_speed_forward.txt","paddle_consistency":"--","paddle_consistency_backwards":"--","paddle_perf":"0.6356820768239547","paddle_perf_backwards":"1.9307727716406997","paddle_gpu_time":"0.6124776245130041","paddle_gpu_time_backward":"1.9250651890482398"},{"name":"depthwise_conv2d_12","op":"depthwise_conv2d","op_count":0,"config":"weight (Variable) - dtype: float32, shape: [256, 1, 5, 5]\nx (Variable) - dtype: float32, shape: [4, 256, 128, 256]\ndata_format (string): NCHW\ndilation (int): 1\ngroups (long): 256\npadding (int): 1\nstride (int): 1\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/depthwise_conv2d_12-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/depthwise_conv2d_12-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/depthwise_conv2d_12-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/depthwise_conv2d_12-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/depthwise_conv2d_12-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/depthwise_conv2d_12-tensorflow_gpu_speed_forward.txt","paddle_consistency":"--","paddle_consistency_backwards":"--","paddle_perf":"0.8036460195268903","paddle_perf_backwards":"4.661791178644919","paddle_gpu_time":"0.8048330585325639","paddle_gpu_time_backward":"4.650039815257207"},{"name":"depthwise_conv2d_13","op":"depthwise_conv2d","op_count":0,"config":"weight (Variable) - dtype: float32, shape: [256, 1, 5, 5]\nx (Variable) - dtype: float32, shape: [4, 128, 256, 256]\ndata_format (string): NHWC\ndilation (int): 1\ngroups (long): 256\npadding (int): 1\nstride (int): 1\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/depthwise_conv2d_13-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/depthwise_conv2d_13-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/depthwise_conv2d_13-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/depthwise_conv2d_13-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/depthwise_conv2d_13-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/depthwise_conv2d_13-tensorflow_gpu_speed_forward.txt","paddle_consistency":"--","paddle_consistency_backwards":"--","paddle_perf":"1.4924990887544594","paddle_perf_backwards":"4.679340245772382","paddle_gpu_time":"1.4495046471249107","paddle_gpu_time_backward":"4.671232876712329"},{"name":"depthwise_conv2d_transpose_0","op":"depthwise_conv2d_transpose","op_count":0,"config":"input (Variable) - dtype: float32, shape: [16, 256, 8, 8]\ndata_format (string): NCHW\ndilation (int): 1\nfilter_size (int): 4\ngroups (int): 128\nnum_filters (int): 128\noutput_size (list): [16, 16]\npadding (list): [1, 1]\nstride (int): 2\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/depthwise_conv2d_transpose_0-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/depthwise_conv2d_transpose_0-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/depthwise_conv2d_transpose_0-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/depthwise_conv2d_transpose_0-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/depthwise_conv2d_transpose_0-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/depthwise_conv2d_transpose_0-tensorflow_gpu_speed_forward.txt","paddle_consistency":"--","paddle_consistency_backwards":"--","paddle_perf":"0.0863104450459383","paddle_perf_backwards":"0.14083580094940806","paddle_gpu_time":"0.05643294212057379","paddle_gpu_time_backward":"0.09976872415377545"},{"name":"diag_0","op":"diag","op_count":2,"config":"x (Variable) - dtype: float32, shape: [1000]\noffset (int): 0\npadding_value (int): 0\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/diag_0-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/diag_0-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/diag_0-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/diag_0-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/diag_0-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/diag_0-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"--","paddle_perf":"0.025036140364043565","paddle_perf_backwards":"--","paddle_gpu_time":"0.006701951819940215","paddle_gpu_time_backward":"--"},{"name":"diag_1","op":"diag","op_count":2,"config":"x (Variable) - dtype: int64, shape: [1000]\noffset (int): 0\npadding_value (int): 0\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/diag_1-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/diag_1-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/diag_1-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/diag_1-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/diag_1-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/diag_1-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"--","paddle_perf":"0.027585516170579562","paddle_perf_backwards":"--","paddle_gpu_time":"0.013081013655462187","paddle_gpu_time_backward":"--"},{"name":"diag_2","op":"diag","op_count":2,"config":"x (Variable) - dtype: float32, shape: [1000]\noffset (int): 5\npadding_value (int): 0\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/diag_2-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/diag_2-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/diag_2-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/diag_2-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/diag_2-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/diag_2-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"--","paddle_perf":"0.025799323101432958","paddle_perf_backwards":"--","paddle_gpu_time":"0.0066919156414762745","paddle_gpu_time_backward":"--"},{"name":"diag_3","op":"diag","op_count":2,"config":"x (Variable) - dtype: float32, shape: [1000]\noffset (int): 0\npadding_value (int): 9\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/diag_3-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/diag_3-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/diag_3-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/diag_3-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/diag_3-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/diag_3-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"--","paddle_perf":"0.02306748409660495","paddle_perf_backwards":"--","paddle_gpu_time":"0.006686291739894552","paddle_gpu_time_backward":"--"},{"name":"dist_0","op":"dist","op_count":0,"config":"x (Variable) - dtype: float32, shape: [1000, 1000]\ny (Variable) - dtype: float32, shape: [1000, 1000]\np (float): 2.0\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/dist_0-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/dist_0-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/dist_0-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/dist_0-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/dist_0-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/dist_0-tensorflow_gpu_speed_forward.txt","paddle_consistency":"--","paddle_consistency_backwards":"--","paddle_perf":"0.1180021130308813","paddle_perf_backwards":"0.1907808440072196","paddle_gpu_time":"0.10002602398732746","paddle_gpu_time_backward":"0.1782452010141253"},{"name":"dist_1","op":"dist","op_count":0,"config":"x (Variable) - dtype: float32, shape: [1000, 1000]\ny (Variable) - dtype: float32, shape: [1000, 1000]\np (float): inf\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/dist_1-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/dist_1-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/dist_1-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/dist_1-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/dist_1-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/dist_1-tensorflow_gpu_speed_forward.txt","paddle_consistency":"--","paddle_consistency_backwards":"--","paddle_perf":"0.07541325627541057","paddle_perf_backwards":"0.14050517763410295","paddle_gpu_time":"0.05691709314227225","paddle_gpu_time_backward":"0.1274175715695953"},{"name":"dist_2","op":"dist","op_count":0,"config":"x (Variable) - dtype: float32, shape: [1000, 1000]\ny (Variable) - dtype: float32, shape: [1000, 1000]\np (float): 0.0\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/dist_2-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/dist_2-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/dist_2-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/dist_2-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/dist_2-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/dist_2-tensorflow_gpu_speed_forward.txt","paddle_consistency":"--","paddle_consistency_backwards":"--","paddle_perf":"0.07628445722618882","paddle_perf_backwards":"0.12399274475720462","paddle_gpu_time":"0.06029667519181585","paddle_gpu_time_backward":"0.09524459438736392"},{"name":"divide_0","op":"divide","op_count":0,"config":"x (Variable) - dtype: float32, shape: [50, 128, 1000]\ny (Variable) - dtype: float32, shape: [128, 1000]\naxis (int): -1\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/divide_0-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/divide_0-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/divide_0-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/divide_0-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/divide_0-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/divide_0-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.08323659171332587","paddle_perf_backwards":"0.2511458592610555","paddle_gpu_time":"0.06488759534323564","paddle_gpu_time_backward":"0.22392268694550058"},{"name":"divide_1","op":"divide","op_count":0,"config":"x (Variable) - dtype: float32, shape: [50, 128, 1000]\ny (Variable) - dtype: float32, shape: [1, 128, 1000]\naxis (int): 0\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/divide_1-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/divide_1-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/divide_1-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/divide_1-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/divide_1-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/divide_1-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.08376612214143864","paddle_perf_backwards":"0.2444098851007068","paddle_gpu_time":"0.06524529062153724","paddle_gpu_time_backward":"0.22379870129870133"},{"name":"divide_2","op":"divide","op_count":0,"config":"x (Variable) - dtype: float32, shape: [16, 2048, 7, 7]\ny (Variable) - dtype: float32, shape: [16, 2048]\naxis (int): 0\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/divide_2-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/divide_2-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/divide_2-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/divide_2-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/divide_2-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/divide_2-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.037375839057570706","paddle_perf_backwards":"0.09080319700833551","paddle_gpu_time":"0.017779792485141534","paddle_gpu_time_backward":"0.06764565043894652"},{"name":"divide_3","op":"divide","op_count":0,"config":"x (Variable) - dtype: float32, shape: [16, 2048, 16, 16]\ny (Variable) - dtype: float32, shape: [16, 2048, 16, 16]\naxis (int): -1\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/divide_3-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/divide_3-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/divide_3-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/divide_3-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/divide_3-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/divide_3-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.13824632029256267","paddle_perf_backwards":"0.3580100550680218","paddle_gpu_time":"0.12188989898989898","paddle_gpu_time_backward":"0.3311556118410638"},{"name":"divide_4","op":"divide","op_count":0,"config":"x (Variable) - dtype: float32, shape: [16, 1, 513, 513]\ny (Variable) - dtype: float32, shape: [1]\naxis (int): -1\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/divide_4-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/divide_4-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/divide_4-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/divide_4-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/divide_4-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/divide_4-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.06182458931076264","paddle_perf_backwards":"4.226368319295451","paddle_gpu_time":"0.042980159129821734","paddle_gpu_time_backward":"4.2174628825371165"},{"name":"divide_5","op":"divide","op_count":0,"config":"x (Variable) - dtype: float32, shape: [512, 896, 4, 12]\ny (Variable) - dtype: float32, shape: [512, 896, 4, 1]\naxis (int): -1\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/divide_5-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/divide_5-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/divide_5-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/divide_5-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/divide_5-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/divide_5-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.2457542027643544","paddle_perf_backwards":"2.8514309254342423","paddle_gpu_time":"0.22795365239294707","paddle_gpu_time_backward":"2.822320069580343"},{"name":"divide_6","op":"divide","op_count":0,"config":"x (Variable) - dtype: float16, shape: [512, 896, 4, 12]\ny (Variable) - dtype: float16, shape: [512, 896, 4, 1]\naxis (int): -1\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/divide_6-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/divide_6-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/divide_6-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/divide_6-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/divide_6-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/divide_6-tensorflow_gpu_speed_forward.txt","paddle_consistency":"--","paddle_consistency_backwards":"--","paddle_perf":"--","paddle_perf_backwards":"--","paddle_gpu_time":"--","paddle_gpu_time_backward":"--"},{"name":"divide_7","op":"divide","op_count":0,"config":"x (Variable) - dtype: float16, shape: [32, 12, 128, 128]\ny (Variable) - dtype: float16, shape: [32, 1, 1, 128]\naxis (int): -1\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/divide_7-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/divide_7-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/divide_7-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/divide_7-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/divide_7-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/divide_7-tensorflow_gpu_speed_forward.txt","paddle_consistency":"--","paddle_consistency_backwards":"--","paddle_perf":"--","paddle_perf_backwards":"--","paddle_gpu_time":"--","paddle_gpu_time_backward":"--"},{"name":"divide_8","op":"divide","op_count":0,"config":"x (Variable) - dtype: float16, shape: [32, 1, 1, 128]\ny (Variable) - dtype: float16, shape: [1, 12, 128, 1]\naxis (int): -1\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/divide_8-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/divide_8-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/divide_8-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/divide_8-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/divide_8-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/divide_8-tensorflow_gpu_speed_forward.txt","paddle_consistency":"--","paddle_consistency_backwards":"--","paddle_perf":"--","paddle_perf_backwards":"--","paddle_gpu_time":"--","paddle_gpu_time_backward":"--"},{"name":"dropout_0","op":"dropout","op_count":151,"config":"x (Variable) - dtype: float32, shape: [16, 36864]\naxis (string): None\nmode (string): downscale_in_infer\np (float): 0.5\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/dropout_0-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/dropout_0-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/dropout_0-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/dropout_0-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/dropout_0-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/dropout_0-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.026738282405968872","paddle_perf_backwards":"0.0391202743607338","paddle_gpu_time":"0.005506079185285254","paddle_gpu_time_backward":"0.013207371556217422"},{"name":"dropout_1","op":"dropout","op_count":151,"config":"x (Variable) - dtype: float32, shape: [16, 16, 16, 16]\naxis (string): None\nmode (string): downscale_in_infer\np (float): 0.1\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/dropout_1-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/dropout_1-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/dropout_1-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/dropout_1-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/dropout_1-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/dropout_1-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.025975945019962816","paddle_perf_backwards":"0.07928381062517262","paddle_gpu_time":"0.0028879057538679343","paddle_gpu_time_backward":"0.004261524690437118"},{"name":"dropout_2","op":"dropout","op_count":151,"config":"x (Variable) - dtype: float32, shape: [16, 35, 1500]\naxis (string): None\nmode (string): upscale_in_train\np (float): 0.65\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/dropout_2-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/dropout_2-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/dropout_2-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/dropout_2-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/dropout_2-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/dropout_2-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.03055574918034101","paddle_perf_backwards":"0.04656615883412987","paddle_gpu_time":"0.010632041343669251","paddle_gpu_time_backward":"0.01932886557886558"},{"name":"dropout_3","op":"dropout","op_count":151,"config":"x (Variable) - dtype: float32, shape: [32, 128, 768]\naxis (string): None\nmode (string): upscale_in_train\np (float): 0.1\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/dropout_3-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/dropout_3-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/dropout_3-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/dropout_3-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/dropout_3-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/dropout_3-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.05879548131203165","paddle_perf_backwards":"0.0966923577444894","paddle_gpu_time":"0.04038380207231973","paddle_gpu_time_backward":"0.07795755968169761"},{"name":"dropout_4","op":"dropout","op_count":151,"config":"x (Variable) - dtype: float16, shape: [32, 128, 768]\naxis (string): None\nmode (string): upscale_in_train\np (float): 0.1\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/dropout_4-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/dropout_4-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/dropout_4-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/dropout_4-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/dropout_4-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/dropout_4-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.04361989546795281","paddle_perf_backwards":"0.06851979664393834","paddle_gpu_time":"0.025809663557279112","paddle_gpu_time_backward":"0.04777950612482987"},{"name":"elu_0","op":"elu","op_count":0,"config":"x (Variable) - dtype: float32, shape: [16, 128, 257, 257]\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/elu_0-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/elu_0-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/elu_0-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/elu_0-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/elu_0-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/elu_0-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"1.333071688611904","paddle_perf_backwards":"3.259305461852966","paddle_gpu_time":"1.3226150901581544","paddle_gpu_time_backward":"3.2286077235772352"},{"name":"elu_1","op":"elu","op_count":0,"config":"x (Variable) - dtype: float16, shape: [16, 128, 257, 257]\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/elu_1-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/elu_1-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/elu_1-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/elu_1-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/elu_1-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/elu_1-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.6918176859318613","paddle_perf_backwards":"1.6651800018035339","paddle_gpu_time":"0.6800674858984689","paddle_gpu_time_backward":"1.6487527650161646"},{"name":"embedding_0","op":"embedding","op_count":99,"config":"weight (Variable) - dtype: float16, shape: [2, 768]\nx (Variable) - dtype: int64, shape: [16, 128]\npadding_idx (string): None\nsparse (bool): False\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/embedding_0-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/embedding_0-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/embedding_0-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/embedding_0-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/embedding_0-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/embedding_0-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.038689496565838255","paddle_perf_backwards":"1.0400660183964943","paddle_gpu_time":"0.006286876192388794","paddle_gpu_time_backward":"1.0182413376309427"},{"name":"embedding_1","op":"embedding","op_count":99,"config":"weight (Variable) - dtype: float32, shape: [37007, 1024]\nx (Variable) - dtype: int64, shape: [16, 16]\npadding_idx (int): 0\nsparse (bool): False\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/embedding_1-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/embedding_1-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/embedding_1-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/embedding_1-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/embedding_1-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/embedding_1-tensorflow_gpu_speed_forward.txt","paddle_consistency":"--","paddle_consistency_backwards":"--","paddle_perf":"0.024465824671948855","paddle_perf_backwards":"0.22513989703891255","paddle_gpu_time":"0.003002562197188122","paddle_gpu_time_backward":"0.20019169928682504"},{"name":"embedding_2","op":"embedding","op_count":99,"config":"weight (Variable) - dtype: float32, shape: [10000, 1500]\nx (Variable) - dtype: int64, shape: [16, 35, 1]\npadding_idx (string): None\nsparse (bool): False\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/embedding_2-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/embedding_2-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/embedding_2-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/embedding_2-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/embedding_2-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/embedding_2-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.03266325233506784","paddle_perf_backwards":"0.16424649237251013","paddle_gpu_time":"0.011698263678578636","paddle_gpu_time_backward":"0.14137664440396683"},{"name":"embedding_3","op":"embedding","op_count":99,"config":"weight (Variable) - dtype: float32, shape: [2, 768]\nx (Variable) - dtype: int64, shape: [16, 128]\npadding_idx (string): None\nsparse (bool): False\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/embedding_3-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/embedding_3-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/embedding_3-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/embedding_3-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/embedding_3-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/embedding_3-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.04492389912507973","paddle_perf_backwards":"0.14276017948072783","paddle_gpu_time":"0.009297934215804252","paddle_gpu_time_backward":"0.10268521585513968"},{"name":"empty_0","op":"empty","op_count":0,"config":"dtype (string): float32\nshape (list): [1]\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/empty_0-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/empty_0-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/empty_0-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/empty_0-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/empty_0-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/empty_0-tensorflow_gpu_speed_forward.txt","paddle_consistency":"--","paddle_consistency_backwards":"--","paddle_perf":"0.01054977884097975","paddle_perf_backwards":"--","paddle_gpu_time":"2.259325044404973e-05","paddle_gpu_time_backward":"--"},{"name":"equal_0","op":"equal","op_count":12,"config":"x (Variable) - dtype: int32, shape: [1]\ny (Variable) - dtype: int32, shape: [1]\ncond (string): None\nforce_cpu (string): None\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/equal_0-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/equal_0-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/equal_0-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/equal_0-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/equal_0-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/equal_0-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"--","paddle_perf":"0.021166529110772814","paddle_perf_backwards":"--","paddle_gpu_time":"0.0013521042084168335","paddle_gpu_time_backward":"--"},{"name":"equal_1","op":"equal","op_count":12,"config":"x (Variable) - dtype: float32, shape: [256, 1024]\ny (Variable) - dtype: float32, shape: [256, 1024]\ncond (string): None\nforce_cpu (string): None\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/equal_1-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/equal_1-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/equal_1-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/equal_1-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/equal_1-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/equal_1-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"--","paddle_perf":"0.021524300317248266","paddle_perf_backwards":"--","paddle_gpu_time":"0.002615563886918634","paddle_gpu_time_backward":"--"},{"name":"equal_2","op":"equal","op_count":12,"config":"x (Variable) - dtype: int32, shape: [1024]\ny (Variable) - dtype: int32, shape: [256, 1024]\ncond (string): None\nforce_cpu (string): None\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/equal_2-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/equal_2-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/equal_2-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/equal_2-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/equal_2-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/equal_2-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"--","paddle_perf":"0.024002061817115672","paddle_perf_backwards":"--","paddle_gpu_time":"0.0028576175611720873","paddle_gpu_time_backward":"--"},{"name":"equal_all_0","op":"equal_all","op_count":0,"config":"x (Variable) - dtype: float32, shape: [16, 1000, 2000]\ny (Variable) - dtype: float32, shape: [16, 1000, 2000]\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/equal_all_0-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/equal_all_0-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/equal_all_0-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/equal_all_0-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/equal_all_0-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/equal_all_0-tensorflow_gpu_speed_forward.txt","paddle_consistency":"--","paddle_consistency_backwards":"--","paddle_perf":"0.42210647038051063","paddle_perf_backwards":"--","paddle_gpu_time":"0.4457207490260149","paddle_gpu_time_backward":"--"},{"name":"equal_all_1","op":"equal_all","op_count":0,"config":"x (Variable) - dtype: float32, shape: [16, 1000]\ny (Variable) - dtype: float32, shape: [16, 1000]\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/equal_all_1-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/equal_all_1-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/equal_all_1-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/equal_all_1-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/equal_all_1-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/equal_all_1-tensorflow_gpu_speed_forward.txt","paddle_consistency":"--","paddle_consistency_backwards":"--","paddle_perf":"0.035654282083316724","paddle_perf_backwards":"--","paddle_gpu_time":"0.00542569776213226","paddle_gpu_time_backward":"--"},{"name":"equal_all_2","op":"equal_all","op_count":0,"config":"x (Variable) - dtype: int32, shape: [16, 1000]\ny (Variable) - dtype: int32, shape: [16, 1000]\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/equal_all_2-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/equal_all_2-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/equal_all_2-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/equal_all_2-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/equal_all_2-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/equal_all_2-tensorflow_gpu_speed_forward.txt","paddle_consistency":"--","paddle_consistency_backwards":"--","paddle_perf":"0.03613622821107203","paddle_perf_backwards":"--","paddle_gpu_time":"0.005278946096204979","paddle_gpu_time_backward":"--"},{"name":"equal_all_3","op":"equal_all","op_count":0,"config":"x (Variable) - dtype: int64, shape: [16, 1000]\ny (Variable) - dtype: int64, shape: [16, 1000]\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/equal_all_3-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/equal_all_3-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/equal_all_3-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/equal_all_3-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/equal_all_3-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/equal_all_3-tensorflow_gpu_speed_forward.txt","paddle_consistency":"--","paddle_consistency_backwards":"--","paddle_perf":"0.0344853011929259","paddle_perf_backwards":"--","paddle_gpu_time":"0.005342283563362609","paddle_gpu_time_backward":"--"},{"name":"exp_0","op":"exp","op_count":7,"config":"x (Variable) - dtype: float32, shape: [16, 128, 257, 257]\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/exp_0-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/exp_0-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/exp_0-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/exp_0-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/exp_0-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/exp_0-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"1.3392261608330187","paddle_perf_backwards":"3.2573307444432933","paddle_gpu_time":"1.322446101148499","paddle_gpu_time_backward":"3.2279376904842536"},{"name":"exp_1","op":"exp","op_count":7,"config":"x (Variable) - dtype: float16, shape: [16, 128, 257, 257]\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/exp_1-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/exp_1-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/exp_1-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/exp_1-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/exp_1-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/exp_1-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.6928261630759688","paddle_perf_backwards":"1.6666504806411528","paddle_gpu_time":"0.678978749118743","paddle_gpu_time_backward":"1.640249146757679"},{"name":"expand_0","op":"expand","op_count":25,"config":"x (Variable) - dtype: float32, shape: [16, 1785, 1]\nshape (list): [1785, 2]\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/expand_0-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/expand_0-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/expand_0-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/expand_0-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/expand_0-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/expand_0-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.022324493953159878","paddle_perf_backwards":"0.03478016172136579","paddle_gpu_time":"0.002523275949882856","paddle_gpu_time_backward":"0.005592423473702012"},{"name":"expand_1","op":"expand","op_count":25,"config":"x (Variable) - dtype: float32, shape: [16, 5, 1, 1]\nshape (list): [5, 128, 128]\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/expand_1-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/expand_1-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/expand_1-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/expand_1-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/expand_1-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/expand_1-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.03326498732274892","paddle_perf_backwards":"6.035813993337203","paddle_gpu_time":"0.017062180835085608","paddle_gpu_time_backward":"5.98336174907231"},{"name":"expand_2","op":"expand","op_count":25,"config":"x (Variable) - dtype: float32, shape: [32, 807, 1]\nshape (list): [807, 807]\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/expand_2-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/expand_2-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/expand_2-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/expand_2-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/expand_2-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/expand_2-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.11817411500580458","paddle_perf_backwards":"0.728682109287807","paddle_gpu_time":"0.10151060424169668","paddle_gpu_time_backward":"0.7138475836431227"},{"name":"expand_as_0","op":"expand_as","op_count":0,"config":"x (Variable) - dtype: float32, shape: [1785, 1]\ny (Variable) - dtype: float32, shape: [1785, 128]\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/expand_as_0-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/expand_as_0-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/expand_as_0-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/expand_as_0-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/expand_as_0-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/expand_as_0-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.03571267030677017","paddle_perf_backwards":"0.13750207667448083","paddle_gpu_time":"0.004719107719401232","paddle_gpu_time_backward":"0.11357570663024064"},{"name":"expand_as_1","op":"expand_as","op_count":0,"config":"x (Variable) - dtype: float32, shape: [5, 1, 1]\ny (Variable) - dtype: float32, shape: [5, 128, 128]\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/expand_as_1-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/expand_as_1-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/expand_as_1-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/expand_as_1-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/expand_as_1-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/expand_as_1-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.03656951748594946","paddle_perf_backwards":"4.012747443452173","paddle_gpu_time":"0.0038263436790310374","paddle_gpu_time_backward":"3.983383383383383"},{"name":"expand_as_2","op":"expand_as","op_count":0,"config":"x (Variable) - dtype: float32, shape: [32, 807, 1]\ny (Variable) - dtype: float32, shape: [32, 807, 807]\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/expand_as_2-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/expand_as_2-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/expand_as_2-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/expand_as_2-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/expand_as_2-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/expand_as_2-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.4109163673556581","paddle_perf_backwards":"1.0003987623720754","paddle_gpu_time":"0.36247785198309934","paddle_gpu_time_backward":"0.9843564673825933"},{"name":"feed_0","op":"feed","op_count":0,"config":"None","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/feed_0-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/feed_0-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/feed_0-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/feed_0-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/feed_0-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/feed_0-tensorflow_gpu_speed_forward.txt","paddle_consistency":"--","paddle_consistency_backwards":"--","paddle_perf":"1.1870459634430555","paddle_perf_backwards":"--","paddle_gpu_time":"0.95568","paddle_gpu_time_backward":"--"},{"name":"fetch_0","op":"fetch","op_count":0,"config":"None","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/fetch_0-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/fetch_0-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/fetch_0-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/fetch_0-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/fetch_0-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/fetch_0-tensorflow_gpu_speed_forward.txt","paddle_consistency":"--","paddle_consistency_backwards":"--","paddle_perf":"1.4941086574476592","paddle_perf_backwards":"--","paddle_gpu_time":"1.1962959255180698","paddle_gpu_time_backward":"--"},{"name":"flatten_0","op":"flatten","op_count":6,"config":"x (Variable) - dtype: float32, shape: [100, 1785, 100]\nstart_axis (int): 1\nstop_axis (int): 2\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/flatten_0-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/flatten_0-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/flatten_0-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/flatten_0-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/flatten_0-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/flatten_0-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.2052234143626933","paddle_perf_backwards":"0.39470536368233816","paddle_gpu_time":"0.19870698867279332","paddle_gpu_time_backward":"0.3846774193548387"},{"name":"flip_0","op":"flip","op_count":0,"config":"x (Variable) - dtype: float32, shape: [100, 1785]\naxis (int): 1\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/flip_0-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/flip_0-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/flip_0-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/flip_0-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/flip_0-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/flip_0-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.038005624498639784","paddle_perf_backwards":"0.05868308398188377","paddle_gpu_time":"0.012275830678197541","paddle_gpu_time_backward":"0.02507102593010146"},{"name":"floor_0","op":"floor","op_count":1,"config":"x (Variable) - dtype: float32, shape: [16, 128, 257, 257]\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/floor_0-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/floor_0-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/floor_0-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/floor_0-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/floor_0-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/floor_0-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"--","paddle_perf":"1.3394457065987444","paddle_perf_backwards":"1.9435950176032608","paddle_gpu_time":"1.3214364863503576","paddle_gpu_time_backward":"1.9249046081596712"},{"name":"floor_1","op":"floor","op_count":1,"config":"x (Variable) - dtype: float16, shape: [16, 128, 257, 257]\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/floor_1-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/floor_1-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/floor_1-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/floor_1-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/floor_1-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/floor_1-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"--","paddle_perf":"0.6903533706206357","paddle_perf_backwards":"1.0030668101950972","paddle_gpu_time":"0.6758086606243705","paddle_gpu_time_backward":"0.9779023323615161"},{"name":"floor_divide_0","op":"floor_divide","op_count":0,"config":"x (Variable) - dtype: int64, shape: [16, 128, 8]\ny (Variable) - dtype: int64, shape: [16, 128, 8]\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/floor_divide_0-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/floor_divide_0-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/floor_divide_0-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/floor_divide_0-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/floor_divide_0-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/floor_divide_0-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"--","paddle_perf":"0.022743429456438338","paddle_perf_backwards":"--","paddle_gpu_time":"0.0018792762815846937","paddle_gpu_time_backward":"--"},{"name":"floor_divide_1","op":"floor_divide","op_count":0,"config":"x (Variable) - dtype: int32, shape: [300, 128, 100]\ny (Variable) - dtype: int32, shape: [300, 128, 100]\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/floor_divide_1-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/floor_divide_1-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/floor_divide_1-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/floor_divide_1-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/floor_divide_1-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/floor_divide_1-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"--","paddle_perf":"0.08261674391721675","paddle_perf_backwards":"--","paddle_gpu_time":"0.057438408723747975","paddle_gpu_time_backward":"--"},{"name":"floor_divide_2","op":"floor_divide","op_count":0,"config":"x (Variable) - dtype: int64, shape: [300, 128, 100]\ny (Variable) - dtype: int64, shape: [1]\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/floor_divide_2-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/floor_divide_2-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/floor_divide_2-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/floor_divide_2-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/floor_divide_2-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/floor_divide_2-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"--","paddle_perf":"0.10862849757284344","paddle_perf_backwards":"--","paddle_gpu_time":"0.07756653225806451","paddle_gpu_time_backward":"--"},{"name":"full_0","op":"full","op_count":0,"config":"dtype (string): float32\nfill_value (float): 210000.0\nshape (list): [1]\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/full_0-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/full_0-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/full_0-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/full_0-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/full_0-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/full_0-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"--","paddle_perf":"0.025311538151332313","paddle_perf_backwards":"--","paddle_gpu_time":"0.001291967830601649","paddle_gpu_time_backward":"--"},{"name":"full_1","op":"full","op_count":0,"config":"dtype (string): int32\nfill_value (int): 0\nshape (list): [1]\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/full_1-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/full_1-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/full_1-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/full_1-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/full_1-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/full_1-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"--","paddle_perf":"0.02579737682731784","paddle_perf_backwards":"--","paddle_gpu_time":"0.0013547534316217592","paddle_gpu_time_backward":"--"},{"name":"gather_0","op":"gather","op_count":35,"config":"index (Variable) - dtype: int32, shape: [16]\ninput (Variable) - dtype: float32, shape: [16, 1]\naxis (int): 0\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/gather_0-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/gather_0-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/gather_0-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/gather_0-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/gather_0-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/gather_0-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.021042872448356786","paddle_perf_backwards":"0.04101845682883749","paddle_gpu_time":"0.0034056613276921524","paddle_gpu_time_backward":"0.005989716312056738"},{"name":"gather_1","op":"gather","op_count":35,"config":"index (Variable) - dtype: int32, shape: [16, 1]\ninput (Variable) - dtype: float32, shape: [16, 256, 14, 14]\naxis (int): 0\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/gather_1-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/gather_1-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/gather_1-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/gather_1-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/gather_1-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/gather_1-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.023766119317357316","paddle_perf_backwards":"0.04980880093861775","paddle_gpu_time":"0.00678702570379437","paddle_gpu_time_backward":"0.02527811023622047"},{"name":"gather_nd_0","op":"gather_nd","op_count":0,"config":"index (Variable) - dtype: int32, shape: [16, 2]\ninput (Variable) - dtype: float32, shape: [16, 10, 10]\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/gather_nd_0-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/gather_nd_0-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/gather_nd_0-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/gather_nd_0-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/gather_nd_0-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/gather_nd_0-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.02819883580110511","paddle_perf_backwards":"0.05324762694689693","paddle_gpu_time":"0.003556058036555492","paddle_gpu_time_backward":"0.00862923832923833"},{"name":"gather_nd_1","op":"gather_nd","op_count":0,"config":"index (Variable) - dtype: int32, shape: [16, 3]\ninput (Variable) - dtype: float32, shape: [16, 256, 14, 14]\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/gather_nd_1-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/gather_nd_1-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/gather_nd_1-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/gather_nd_1-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/gather_nd_1-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/gather_nd_1-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.029893237424184045","paddle_perf_backwards":"0.04806949431637683","paddle_gpu_time":"0.005825424721734036","paddle_gpu_time_backward":"0.010443475733798316"},{"name":"greater_equal_0","op":"greater_equal","op_count":0,"config":"x (Variable) - dtype: int32, shape: [1]\ny (Variable) - dtype: int32, shape: [1]\ncond (string): None\nforce_cpu (string): None\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/greater_equal_0-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/greater_equal_0-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/greater_equal_0-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/greater_equal_0-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/greater_equal_0-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/greater_equal_0-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"--","paddle_perf":"0.024361911422026185","paddle_perf_backwards":"--","paddle_gpu_time":"0.0022884172589848835","paddle_gpu_time_backward":"--"},{"name":"greater_equal_1","op":"greater_equal","op_count":0,"config":"x (Variable) - dtype: float32, shape: [256, 1024]\ny (Variable) - dtype: float32, shape: [256, 1024]\ncond (string): None\nforce_cpu (string): None\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/greater_equal_1-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/greater_equal_1-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/greater_equal_1-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/greater_equal_1-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/greater_equal_1-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/greater_equal_1-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"--","paddle_perf":"0.020867789197780325","paddle_perf_backwards":"--","paddle_gpu_time":"0.0025632218844984797","paddle_gpu_time_backward":"--"},{"name":"greater_equal_2","op":"greater_equal","op_count":0,"config":"x (Variable) - dtype: int32, shape: [1024]\ny (Variable) - dtype: int32, shape: [256, 1024]\ncond (string): None\nforce_cpu (string): None\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/greater_equal_2-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/greater_equal_2-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/greater_equal_2-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/greater_equal_2-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/greater_equal_2-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/greater_equal_2-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"--","paddle_perf":"0.025900045712151844","paddle_perf_backwards":"--","paddle_gpu_time":"0.0034325955734406436","paddle_gpu_time_backward":"--"},{"name":"greater_than_0","op":"greater_than","op_count":0,"config":"x (Variable) - dtype: int32, shape: [1]\ny (Variable) - dtype: int32, shape: [1]\ncond (string): None\nforce_cpu (string): None\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/greater_than_0-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/greater_than_0-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/greater_than_0-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/greater_than_0-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/greater_than_0-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/greater_than_0-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"--","paddle_perf":"0.022114780479538177","paddle_perf_backwards":"--","paddle_gpu_time":"0.0013094880272517782","paddle_gpu_time_backward":"--"},{"name":"greater_than_1","op":"greater_than","op_count":0,"config":"x (Variable) - dtype: float32, shape: [256, 1024]\ny (Variable) - dtype: float32, shape: [256, 1024]\ncond (string): None\nforce_cpu (string): None\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/greater_than_1-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/greater_than_1-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/greater_than_1-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/greater_than_1-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/greater_than_1-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/greater_than_1-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"--","paddle_perf":"0.027605383572931998","paddle_perf_backwards":"--","paddle_gpu_time":"0.003134035229803604","paddle_gpu_time_backward":"--"},{"name":"greater_than_2","op":"greater_than","op_count":0,"config":"x (Variable) - dtype: int32, shape: [1024]\ny (Variable) - dtype: int32, shape: [256, 1024]\ncond (string): None\nforce_cpu (string): None\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/greater_than_2-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/greater_than_2-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/greater_than_2-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/greater_than_2-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/greater_than_2-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/greater_than_2-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"--","paddle_perf":"0.031637404868024625","paddle_perf_backwards":"--","paddle_gpu_time":"0.00285287310053336","paddle_gpu_time_backward":"--"},{"name":"grid_sample_0","op":"grid_sample","op_count":0,"config":"grid (Variable) - dtype: float32, shape: [4, 12, 16, 2]\nx (Variable) - dtype: float32, shape: [4, 1, 32, 32]\nalign_corners (bool): True\nmode (string): bilinear\nout_shape (list): [4, 1, 12, 16]\npadding_mode (string): zeros\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/grid_sample_0-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/grid_sample_0-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/grid_sample_0-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/grid_sample_0-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/grid_sample_0-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/grid_sample_0-tensorflow_gpu_speed_forward.txt","paddle_consistency":"--","paddle_consistency_backwards":"--","paddle_perf":"0.024002668808917615","paddle_perf_backwards":"0.041827133723667684","paddle_gpu_time":"0.0018206788511749348","paddle_gpu_time_backward":"0.006088800530152419"},{"name":"grid_sample_1","op":"grid_sample","op_count":0,"config":"grid (Variable) - dtype: float32, shape: [4, 128, 128, 2]\nx (Variable) - dtype: float32, shape: [4, 1, 64, 64]\nalign_corners (bool): True\nmode (string): nearest\nout_shape (list): [4, 1, 128, 128]\npadding_mode (string): zeros\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/grid_sample_1-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/grid_sample_1-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/grid_sample_1-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/grid_sample_1-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/grid_sample_1-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/grid_sample_1-tensorflow_gpu_speed_forward.txt","paddle_consistency":"--","paddle_consistency_backwards":"--","paddle_perf":"0.023510261457793565","paddle_perf_backwards":"0.048361019212372444","paddle_gpu_time":"0.004799011997177136","paddle_gpu_time_backward":"0.015757652072839985"},{"name":"grid_sample_2","op":"grid_sample","op_count":0,"config":"grid (Variable) - dtype: float32, shape: [4, 256, 246, 2]\nx (Variable) - dtype: float32, shape: [4, 1, 128, 128]\nalign_corners (bool): False\nmode (string): bilinear\nout_shape (list): [4, 1, 256, 256]\npadding_mode (string): zeros\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/grid_sample_2-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/grid_sample_2-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/grid_sample_2-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/grid_sample_2-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/grid_sample_2-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/grid_sample_2-tensorflow_gpu_speed_forward.txt","paddle_consistency":"--","paddle_consistency_backwards":"--","paddle_perf":"0.032491100077726404","paddle_perf_backwards":"0.08906685576146964","paddle_gpu_time":"0.015161632845681925","paddle_gpu_time_backward":"0.06789697802197803"},{"name":"grid_sample_3","op":"grid_sample","op_count":0,"config":"grid (Variable) - dtype: float32, shape: [4, 256, 246, 2]\nx (Variable) - dtype: float32, shape: [4, 1, 128, 128]\nalign_corners (bool): False\nmode (string): bilinear\nout_shape (list): [4, 1, 256, 256]\npadding_mode (string): reflection\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/grid_sample_3-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/grid_sample_3-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/grid_sample_3-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/grid_sample_3-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/grid_sample_3-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/grid_sample_3-tensorflow_gpu_speed_forward.txt","paddle_consistency":"--","paddle_consistency_backwards":"--","paddle_perf":"0.035432406834193644","paddle_perf_backwards":"0.09401379799356266","paddle_gpu_time":"0.017641325536062378","paddle_gpu_time_backward":"0.07483816964285712"},{"name":"grid_sample_4","op":"grid_sample","op_count":0,"config":"grid (Variable) - dtype: float32, shape: [4, 256, 246, 2]\nx (Variable) - dtype: float32, shape: [4, 1, 128, 128]\nalign_corners (bool): False\nmode (string): bilinear\nout_shape (list): [4, 1, 256, 256]\npadding_mode (string): border\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/grid_sample_4-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/grid_sample_4-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/grid_sample_4-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/grid_sample_4-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/grid_sample_4-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/grid_sample_4-tensorflow_gpu_speed_forward.txt","paddle_consistency":"--","paddle_consistency_backwards":"--","paddle_perf":"0.033201003561214526","paddle_perf_backwards":"0.09058251672861528","paddle_gpu_time":"0.015051062331259538","paddle_gpu_time_backward":"0.05826056921801602"},{"name":"group_norm_0","op":"group_norm","op_count":3,"config":"x (Variable) - dtype: float32, shape: [8, 6, 10, 10]\ndata_format (string): NCHW\nepsilon (float): 1e-05\nnum_groups (int): 3\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/group_norm_0-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/group_norm_0-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/group_norm_0-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/group_norm_0-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/group_norm_0-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/group_norm_0-tensorflow_gpu_speed_forward.txt","paddle_consistency":"--","paddle_consistency_backwards":"--","paddle_perf":"0.039115730597048384","paddle_perf_backwards":"0.07276024137224471","paddle_gpu_time":"0.006600294623127915","paddle_gpu_time_backward":"0.016712980269989616"},{"name":"histogram_0","op":"histogram","op_count":0,"config":"input (Variable) - dtype: int32, shape: [16, 64]\nbins (int32): 100\nmax (int32): 0\nmin (int32): 0\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/histogram_0-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/histogram_0-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/histogram_0-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/histogram_0-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/histogram_0-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/histogram_0-tensorflow_gpu_speed_forward.txt","paddle_consistency":"--","paddle_consistency_backwards":"--","paddle_perf":"0.12597259210080516","paddle_perf_backwards":"0.12511525835309711","paddle_gpu_time":"0.05507212657049791","paddle_gpu_time_backward":"0.055234867959803695"},{"name":"histogram_1","op":"histogram","op_count":0,"config":"input (Variable) - dtype: int64, shape: [16, 64]\nbins (int32): 100\nmax (int32): 0\nmin (int32): 0\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/histogram_1-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/histogram_1-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/histogram_1-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/histogram_1-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/histogram_1-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/histogram_1-tensorflow_gpu_speed_forward.txt","paddle_consistency":"--","paddle_consistency_backwards":"--","paddle_perf":"0.12825532835357042","paddle_perf_backwards":"0.1258419484508281","paddle_gpu_time":"0.0611083180987203","paddle_gpu_time_backward":"0.061410902427851584"},{"name":"histogram_2","op":"histogram","op_count":0,"config":"input (Variable) - dtype: float32, shape: [16, 64]\nbins (int32): 100\nmax (int32): 0\nmin (int32): 0\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/histogram_2-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/histogram_2-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/histogram_2-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/histogram_2-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/histogram_2-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/histogram_2-tensorflow_gpu_speed_forward.txt","paddle_consistency":"--","paddle_consistency_backwards":"--","paddle_perf":"0.08051541386818399","paddle_perf_backwards":"0.07784220637107382","paddle_gpu_time":"0.0126065188172043","paddle_gpu_time_backward":"0.012797489263296994"},{"name":"increment_0","op":"increment","op_count":11,"config":"x (Variable) - dtype: int32, shape: [1]\nin_place (bool): True\nvalue (float): 1.0\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/increment_0-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/increment_0-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/increment_0-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/increment_0-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/increment_0-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/increment_0-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"--","paddle_perf":"0.017014085030069154","paddle_perf_backwards":"--","paddle_gpu_time":"0.001333706492977814","paddle_gpu_time_backward":"--"},{"name":"index_sample_0","op":"index_sample","op_count":0,"config":"index (Variable) - dtype: int64, shape: [5100, 1]\nx (Variable) - dtype: float32, shape: [5100, 38506]\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/index_sample_0-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/index_sample_0-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/index_sample_0-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/index_sample_0-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/index_sample_0-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/index_sample_0-tensorflow_gpu_speed_forward.txt","paddle_consistency":"--","paddle_consistency_backwards":"--","paddle_perf":"0.020728347537753822","paddle_perf_backwards":"1.0043110814061131","paddle_gpu_time":"0.009198625858838226","paddle_gpu_time_backward":"0.9757478957509382"},{"name":"index_sample_1","op":"index_sample","op_count":0,"config":"index (Variable) - dtype: int64, shape: [100, 64]\nx (Variable) - dtype: float32, shape: [100, 128]\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/index_sample_1-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/index_sample_1-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/index_sample_1-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/index_sample_1-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/index_sample_1-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/index_sample_1-tensorflow_gpu_speed_forward.txt","paddle_consistency":"--","paddle_consistency_backwards":"--","paddle_perf":"0.019601899750378664","paddle_perf_backwards":"0.037160941532679966","paddle_gpu_time":"0.0015662502559901702","paddle_gpu_time_backward":"0.0046728314665277415"},{"name":"index_sample_2","op":"index_sample","op_count":0,"config":"index (Variable) - dtype: int64, shape: [5100, 96]\nx (Variable) - dtype: float32, shape: [5100, 128]\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/index_sample_2-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/index_sample_2-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/index_sample_2-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/index_sample_2-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/index_sample_2-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/index_sample_2-tensorflow_gpu_speed_forward.txt","paddle_consistency":"--","paddle_consistency_backwards":"--","paddle_perf":"0.02770572302331886","paddle_perf_backwards":"0.044789898347663115","paddle_gpu_time":"0.013315280464216633","paddle_gpu_time_backward":"0.02855679093089165"},{"name":"index_select_0","op":"index_select","op_count":0,"config":"index (Variable) - dtype: int64, shape: [10]\nx (Variable) - dtype: float32, shape: [100, 1785]\naxis (int): 0\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/index_select_0-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/index_select_0-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/index_select_0-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/index_select_0-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/index_select_0-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/index_select_0-tensorflow_gpu_speed_forward.txt","paddle_consistency":"--","paddle_consistency_backwards":"--","paddle_perf":"0.03041004648013991","paddle_perf_backwards":"0.05556904539770009","paddle_gpu_time":"0.0024032584389308913","paddle_gpu_time_backward":"0.00918456980937661"},{"name":"index_select_1","op":"index_select","op_count":0,"config":"index (Variable) - dtype: int, shape: [10]\nx (Variable) - dtype: float32, shape: [100, 100, 100]\naxis (int): 1\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/index_select_1-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/index_select_1-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/index_select_1-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/index_select_1-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/index_select_1-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/index_select_1-tensorflow_gpu_speed_forward.txt","paddle_consistency":"--","paddle_consistency_backwards":"--","paddle_perf":"0.032358266869369816","paddle_perf_backwards":"0.08342047126925721","paddle_gpu_time":"0.004383363471971067","paddle_gpu_time_backward":"0.0404170403587444"},{"name":"instance_norm_0","op":"instance_norm","op_count":0,"config":"x (Variable) - dtype: float32, shape: [16, 256, 32, 32]\neps (float): 1e-05\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/instance_norm_0-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/instance_norm_0-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/instance_norm_0-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/instance_norm_0-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/instance_norm_0-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/instance_norm_0-tensorflow_gpu_speed_forward.txt","paddle_consistency":"--","paddle_consistency_backwards":"--","paddle_perf":"0.10722511743445952","paddle_perf_backwards":"0.19126249604435808","paddle_gpu_time":"0.07680715748625634","paddle_gpu_time_backward":"0.1610071663761379"},{"name":"interp_area_0","op":"interp_area","op_count":0,"config":"x (Variable) - dtype: float32, shape: [16, 512, 64, 64]\nalign_corners (bool): False\ndata_format (string): NHWC\ninterp_mode (string): area\nscale_factor (string): None\nsize (list): [128, 128]\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/interp_area_0-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/interp_area_0-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/interp_area_0-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/interp_area_0-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/interp_area_0-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/interp_area_0-tensorflow_gpu_speed_forward.txt","paddle_consistency":"--","paddle_consistency_backwards":"--","paddle_perf":"3.817060772253542","paddle_perf_backwards":"5.061308461792615","paddle_gpu_time":"4.030509156771986","paddle_gpu_time_backward":"5.285037603905528"},{"name":"interp_area_1","op":"interp_area","op_count":0,"config":"x (Variable) - dtype: float32, shape: [16, 64, 64, 64]\nalign_corners (bool): True\ndata_format (string): NCHW\ninterp_mode (string): area\nscale_factor (string): None\nsize (list): [32, 64]\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/interp_area_1-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/interp_area_1-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/interp_area_1-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/interp_area_1-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/interp_area_1-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/interp_area_1-tensorflow_gpu_speed_forward.txt","paddle_consistency":"--","paddle_consistency_backwards":"--","paddle_perf":"0.1044382854383819","paddle_perf_backwards":"0.20921716884690889","paddle_gpu_time":"0.09818303755674783","paddle_gpu_time_backward":"0.2090782800441014"},{"name":"interp_area_2","op":"interp_area","op_count":0,"config":"x (Variable) - dtype: float32, shape: [16, 2, 5, 12, 12]\nalign_corners (bool): False\ndata_format (string): NDHWC\ninterp_mode (string): area\nscale_factor (string): None\nsize (list): [10, 6, 4]\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/interp_area_2-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/interp_area_2-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/interp_area_2-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/interp_area_2-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/interp_area_2-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/interp_area_2-tensorflow_gpu_speed_forward.txt","paddle_consistency":"--","paddle_consistency_backwards":"--","paddle_perf":"0.02519281543031031","paddle_perf_backwards":"0.042801487202547034","paddle_gpu_time":"0.0062362775707523405","paddle_gpu_time_backward":"0.013821175950486294"},{"name":"interp_bicubic_0","op":"interp_bicubic","op_count":0,"config":"x (Variable) - dtype: float32, shape: [16, 512, 64, 64]\nalign_corners (bool): False\ndata_format (string): NHWC\ninterp_mode (string): bicubic\nscale_factor (string): None\nsize (list): [128, 128]\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/interp_bicubic_0-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/interp_bicubic_0-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/interp_bicubic_0-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/interp_bicubic_0-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/interp_bicubic_0-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/interp_bicubic_0-tensorflow_gpu_speed_forward.txt","paddle_consistency":"--","paddle_consistency_backwards":"--","paddle_perf":"0.8012362888881138","paddle_perf_backwards":"2.5448738312234687","paddle_gpu_time":"0.8278441879637263","paddle_gpu_time_backward":"2.5320562560620754"},{"name":"interp_bicubic_1","op":"interp_bicubic","op_count":0,"config":"x (Variable) - dtype: float32, shape: [16, 64, 64, 64]\nalign_corners (bool): True\ndata_format (string): NCHW\ninterp_mode (string): bicubic\nscale_factor (float32): 2.0\nsize (string): None\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/interp_bicubic_1-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/interp_bicubic_1-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/interp_bicubic_1-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/interp_bicubic_1-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/interp_bicubic_1-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/interp_bicubic_1-tensorflow_gpu_speed_forward.txt","paddle_consistency":"--","paddle_consistency_backwards":"--","paddle_perf":"0.6469135381737534","paddle_perf_backwards":"2.1377130430571887","paddle_gpu_time":"0.639441430580671","paddle_gpu_time_backward":"2.135330548754141"},{"name":"interp_bicubic_2","op":"interp_bicubic","op_count":0,"config":"x (Variable) - dtype: float32, shape: [16, 64, 64, 64]\nalign_corners (bool): False\ndata_format (string): NHWC\ninterp_mode (string): bicubic\nscale_factor (list): [2, 3]\nsize (string): None\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/interp_bicubic_2-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/interp_bicubic_2-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/interp_bicubic_2-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/interp_bicubic_2-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/interp_bicubic_2-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/interp_bicubic_2-tensorflow_gpu_speed_forward.txt","paddle_consistency":"--","paddle_consistency_backwards":"--","paddle_perf":"1.1511053357805523","paddle_perf_backwards":"2.6090643843825982","paddle_gpu_time":"1.107781789009226","paddle_gpu_time_backward":"2.5848727531589253"},{"name":"interp_bilinear_0","op":"interp_bilinear","op_count":0,"config":"x (Variable) - dtype: float32, shape: [16, 512, 64, 402]\nalign_corners (bool): True\ndata_format (string): NCHW\nscale_factor (string): None\nsize (list): [128, 402]\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/interp_bilinear_0-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/interp_bilinear_0-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/interp_bilinear_0-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/interp_bilinear_0-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/interp_bilinear_0-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/interp_bilinear_0-tensorflow_gpu_speed_forward.txt","paddle_consistency":"--","paddle_consistency_backwards":"--","paddle_perf":"8.696240308333415","paddle_perf_backwards":"33.324586371986236","paddle_gpu_time":"8.824231943031535","paddle_gpu_time_backward":"33.34018426647768"},{"name":"interp_bilinear_1","op":"interp_bilinear","op_count":0,"config":"x (Variable) - dtype: float32, shape: [16, 64, 402, 512]\nalign_corners (bool): True\ndata_format (string): NHWC\nscale_factor (string): None\nsize (list): [128, 402]\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/interp_bilinear_1-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/interp_bilinear_1-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/interp_bilinear_1-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/interp_bilinear_1-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/interp_bilinear_1-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/interp_bilinear_1-tensorflow_gpu_speed_forward.txt","paddle_consistency":"--","paddle_consistency_backwards":"--","paddle_perf":"11.210425289309754","paddle_perf_backwards":"32.068995067051475","paddle_gpu_time":"11.353298875038005","paddle_gpu_time_backward":"32.096297499188054"},{"name":"interp_bilinear_2","op":"interp_bilinear","op_count":0,"config":"x (Variable) - dtype: float32, shape: [16, 256, 1, 1]\nalign_corners (bool): True\ndata_format (string): NCHW\nscale_factor (string): None\nsize (tuple): [33, 33]\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/interp_bilinear_2-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/interp_bilinear_2-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/interp_bilinear_2-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/interp_bilinear_2-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/interp_bilinear_2-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/interp_bilinear_2-tensorflow_gpu_speed_forward.txt","paddle_consistency":"--","paddle_consistency_backwards":"--","paddle_perf":"0.10739394596644809","paddle_perf_backwards":"1.6636850882549674","paddle_gpu_time":"0.09763028908672601","paddle_gpu_time_backward":"1.658714918759232"},{"name":"interp_bilinear_3","op":"interp_bilinear","op_count":0,"config":"x (Variable) - dtype: float32, shape: [16, 1, 1, 256]\nalign_corners (bool): True\ndata_format (string): NHWC\nscale_factor (string): None\nsize (tuple): [33, 33]\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/interp_bilinear_3-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/interp_bilinear_3-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/interp_bilinear_3-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/interp_bilinear_3-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/interp_bilinear_3-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/interp_bilinear_3-tensorflow_gpu_speed_forward.txt","paddle_consistency":"--","paddle_consistency_backwards":"--","paddle_perf":"0.14252200418589067","paddle_perf_backwards":"0.5603838940056003","paddle_gpu_time":"0.11390278055611121","paddle_gpu_time_backward":"0.5767805732882714"},{"name":"interp_bilinear_4","op":"interp_bilinear","op_count":0,"config":"x (Variable) - dtype: float32, shape: [16, 19, 129, 129]\nalign_corners (bool): True\ndata_format (string): NCHW\nscale_factor (string): None\nsize (tuple): [513, 513]\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/interp_bilinear_4-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/interp_bilinear_4-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/interp_bilinear_4-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/interp_bilinear_4-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/interp_bilinear_4-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/interp_bilinear_4-tensorflow_gpu_speed_forward.txt","paddle_consistency":"--","paddle_consistency_backwards":"--","paddle_perf":"1.5597649982997348","paddle_perf_backwards":"4.268120016370501","paddle_gpu_time":"1.520406935952691","paddle_gpu_time_backward":"4.187873045949787"},{"name":"interp_bilinear_5","op":"interp_bilinear","op_count":0,"config":"x (Variable) - dtype: float32, shape: [16, 129, 129, 19]\nalign_corners (bool): True\ndata_format (string): NHWC\nscale_factor (string): None\nsize (tuple): [513, 513]\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/interp_bilinear_5-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/interp_bilinear_5-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/interp_bilinear_5-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/interp_bilinear_5-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/interp_bilinear_5-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/interp_bilinear_5-tensorflow_gpu_speed_forward.txt","paddle_consistency":"--","paddle_consistency_backwards":"--","paddle_perf":"1.9198765560072295","paddle_perf_backwards":"3.837694440569196","paddle_gpu_time":"1.8958016032064129","paddle_gpu_time_backward":"3.814294330518697"},{"name":"interp_bilinear_6","op":"interp_bilinear","op_count":0,"config":"x (Variable) - dtype: float32, shape: [4, 256, 1, 1]\nalign_corners (bool): False\ndata_format (string): NCHW\nscale_factor (string): None\nsize (tuple): [64, 128]\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/interp_bilinear_6-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/interp_bilinear_6-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/interp_bilinear_6-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/interp_bilinear_6-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/interp_bilinear_6-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/interp_bilinear_6-tensorflow_gpu_speed_forward.txt","paddle_consistency":"--","paddle_consistency_backwards":"--","paddle_perf":"0.2006504000449667","paddle_perf_backwards":"4.9196644705169055","paddle_gpu_time":"0.15369073814762954","paddle_gpu_time_backward":"4.929890518487916"},{"name":"interp_linear_0","op":"interp_linear","op_count":0,"config":"x (Variable) - dtype: float32, shape: [16, 512, 64]\nalign_corners (bool): False\ndata_format (string): NCW\ninterp_mode (string): linear\nscale_factor (string): None\nsize (list): [128]\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/interp_linear_0-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/interp_linear_0-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/interp_linear_0-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/interp_linear_0-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/interp_linear_0-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/interp_linear_0-tensorflow_gpu_speed_forward.txt","paddle_consistency":"--","paddle_consistency_backwards":"--","paddle_perf":"0.036274900241773954","paddle_perf_backwards":"0.0617457895862813","paddle_gpu_time":"0.01837799140928615","paddle_gpu_time_backward":"0.03985640805829405"},{"name":"interp_linear_1","op":"interp_linear","op_count":0,"config":"x (Variable) - dtype: float32, shape: [16, 64, 64]\nalign_corners (bool): True\ndata_format (string): NCW\ninterp_mode (string): linear\nscale_factor (float32): 2.0\nsize (string): None\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/interp_linear_1-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/interp_linear_1-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/interp_linear_1-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/interp_linear_1-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/interp_linear_1-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/interp_linear_1-tensorflow_gpu_speed_forward.txt","paddle_consistency":"--","paddle_consistency_backwards":"--","paddle_perf":"0.02454494943424147","paddle_perf_backwards":"0.04094717453937141","paddle_gpu_time":"0.004960742544126598","paddle_gpu_time_backward":"0.010877189093327421"},{"name":"interp_nearest_0","op":"interp_nearest","op_count":0,"config":"x (Variable) - dtype: float32, shape: [16, 256, 16, 16]\nalign_corners (bool): False\ndata_format (string): NCHW\nscale_factor (string): None\nsize (list): [32, 32]\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/interp_nearest_0-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/interp_nearest_0-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/interp_nearest_0-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/interp_nearest_0-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/interp_nearest_0-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/interp_nearest_0-tensorflow_gpu_speed_forward.txt","paddle_consistency":"--","paddle_consistency_backwards":"--","paddle_perf":"0.09962125700347278","paddle_perf_backwards":"0.17396308937851263","paddle_gpu_time":"0.08315327140549274","paddle_gpu_time_backward":"0.16094248234106961"},{"name":"interp_nearest_1","op":"interp_nearest","op_count":0,"config":"x (Variable) - dtype: float32, shape: [16, 16, 16, 256]\nalign_corners (bool): False\ndata_format (string): NHWC\nscale_factor (string): None\nsize (list): [32, 32]\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/interp_nearest_1-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/interp_nearest_1-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/interp_nearest_1-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/interp_nearest_1-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/interp_nearest_1-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/interp_nearest_1-tensorflow_gpu_speed_forward.txt","paddle_consistency":"--","paddle_consistency_backwards":"--","paddle_perf":"0.11432998034418847","paddle_perf_backwards":"0.20904881613595144","paddle_gpu_time":"0.10488664987405541","paddle_gpu_time_backward":"0.20972809667673714"},{"name":"interp_trilinear_0","op":"interp_trilinear","op_count":0,"config":"x (Variable) - dtype: float32, shape: [16, 25, 12, 32, 64]\nalign_corners (bool): False\ndata_format (string): NCDHW\ninterp_mode (string): trilinear\nscale_factor (string): None\nsize (list): [64, 16, 32]\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/interp_trilinear_0-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/interp_trilinear_0-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/interp_trilinear_0-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/interp_trilinear_0-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/interp_trilinear_0-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/interp_trilinear_0-tensorflow_gpu_speed_forward.txt","paddle_consistency":"--","paddle_consistency_backwards":"--","paddle_perf":"0.461788323460793","paddle_perf_backwards":"1.2910850194035743","paddle_gpu_time":"0.4185230393652731","paddle_gpu_time_backward":"1.2455647734524569"},{"name":"interp_trilinear_1","op":"interp_trilinear","op_count":0,"config":"x (Variable) - dtype: float32, shape: [16, 25, 7, 8, 9]\nalign_corners (bool): True\ndata_format (string): NCDHW\ninterp_mode (string): trilinear\nscale_factor (float32): 2.0\nsize (string): None\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/interp_trilinear_1-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/interp_trilinear_1-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/interp_trilinear_1-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/interp_trilinear_1-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/interp_trilinear_1-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/interp_trilinear_1-tensorflow_gpu_speed_forward.txt","paddle_consistency":"--","paddle_consistency_backwards":"--","paddle_perf":"0.07926468946495835","paddle_perf_backwards":"0.21818827609626615","paddle_gpu_time":"0.05245561239843515","paddle_gpu_time_backward":"0.17359583092067168"},{"name":"interp_trilinear_2","op":"interp_trilinear","op_count":0,"config":"x (Variable) - dtype: float32, shape: [16, 5, 12, 24, 8]\nalign_corners (bool): False\ndata_format (string): NCDHW\ninterp_mode (string): trilinear\nscale_factor (list): [2, 3, 4]\nsize (string): None\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/interp_trilinear_2-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/interp_trilinear_2-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/interp_trilinear_2-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/interp_trilinear_2-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/interp_trilinear_2-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/interp_trilinear_2-tensorflow_gpu_speed_forward.txt","paddle_consistency":"--","paddle_consistency_backwards":"--","paddle_perf":"0.15537787456901708","paddle_perf_backwards":"0.4412288568457779","paddle_gpu_time":"0.14975975975975975","paddle_gpu_time_backward":"0.41418031577394127"},{"name":"inverse_0","op":"inverse","op_count":0,"config":"x (Variable) - dtype: float32, shape: [128, 64, 64]\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/inverse_0-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/inverse_0-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/inverse_0-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/inverse_0-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/inverse_0-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/inverse_0-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.33941998773691606","paddle_perf_backwards":"0.3875311540097606","paddle_gpu_time":"0.3068543689320388","paddle_gpu_time_backward":"0.3396419995634141"},{"name":"isfinite_0","op":"isfinite","op_count":6,"config":"x (Variable) - dtype: float32, shape: [1]\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/isfinite_0-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/isfinite_0-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/isfinite_0-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/isfinite_0-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/isfinite_0-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/isfinite_0-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"--","paddle_perf":"0.03277160683456733","paddle_perf_backwards":"--","paddle_gpu_time":"0.004143530644316396","paddle_gpu_time_backward":"--"},{"name":"isfinite_1","op":"isfinite","op_count":6,"config":"x (Variable) - dtype: float32, shape: [300, 1000]\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/isfinite_1-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/isfinite_1-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/isfinite_1-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/isfinite_1-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/isfinite_1-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/isfinite_1-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"--","paddle_perf":"0.029468049808424344","paddle_perf_backwards":"--","paddle_gpu_time":"0.006800171037628278","paddle_gpu_time_backward":"--"},{"name":"isfinite_2","op":"isfinite","op_count":6,"config":"x (Variable) - dtype: float16, shape: [300, 1000]\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/isfinite_2-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/isfinite_2-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/isfinite_2-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/isfinite_2-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/isfinite_2-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/isfinite_2-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"--","paddle_perf":"0.030615135115020134","paddle_perf_backwards":"--","paddle_gpu_time":"0.0064143126177024475","paddle_gpu_time_backward":"--"},{"name":"isinf_0","op":"isinf","op_count":0,"config":"x (Variable) - dtype: float32, shape: [1]\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/isinf_0-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/isinf_0-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/isinf_0-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/isinf_0-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/isinf_0-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/isinf_0-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"--","paddle_perf":"0.01952721148121114","paddle_perf_backwards":"--","paddle_gpu_time":"0.001327466937945066","paddle_gpu_time_backward":"--"},{"name":"isinf_1","op":"isinf","op_count":0,"config":"x (Variable) - dtype: float32, shape: [300, 1000]\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/isinf_1-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/isinf_1-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/isinf_1-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/isinf_1-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/isinf_1-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/isinf_1-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"--","paddle_perf":"0.020951397564946385","paddle_perf_backwards":"--","paddle_gpu_time":"0.002009669889963321","paddle_gpu_time_backward":"--"},{"name":"isinf_2","op":"isinf","op_count":0,"config":"x (Variable) - dtype: float16, shape: [300, 1000]\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/isinf_2-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/isinf_2-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/isinf_2-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/isinf_2-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/isinf_2-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/isinf_2-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"--","paddle_perf":"0.020397196010667452","paddle_perf_backwards":"--","paddle_gpu_time":"0.001816211020931226","paddle_gpu_time_backward":"--"},{"name":"isnan_0","op":"isnan","op_count":0,"config":"x (Variable) - dtype: float32, shape: [1]\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/isnan_0-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/isnan_0-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/isnan_0-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/isnan_0-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/isnan_0-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/isnan_0-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"--","paddle_perf":"0.01970918811097437","paddle_perf_backwards":"--","paddle_gpu_time":"0.001351002136534744","paddle_gpu_time_backward":"--"},{"name":"isnan_1","op":"isnan","op_count":0,"config":"x (Variable) - dtype: float32, shape: [300, 1000]\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/isnan_1-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/isnan_1-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/isnan_1-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/isnan_1-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/isnan_1-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/isnan_1-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"--","paddle_perf":"0.018678149398492303","paddle_perf_backwards":"--","paddle_gpu_time":"0.002005590339892666","paddle_gpu_time_backward":"--"},{"name":"isnan_2","op":"isnan","op_count":0,"config":"x (Variable) - dtype: float16, shape: [300, 1000]\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/isnan_2-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/isnan_2-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/isnan_2-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/isnan_2-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/isnan_2-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/isnan_2-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"--","paddle_perf":"0.020170211791992188","paddle_perf_backwards":"--","paddle_gpu_time":"0.0017497854077253218","paddle_gpu_time_backward":"--"},{"name":"layer_norm_0","op":"layer_norm","op_count":20,"config":"x (Variable) - dtype: float32, shape: [16, 128, 768]\nepsilon (float): 1e-05\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/layer_norm_0-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/layer_norm_0-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/layer_norm_0-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/layer_norm_0-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/layer_norm_0-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/layer_norm_0-tensorflow_gpu_speed_forward.txt","paddle_consistency":"--","paddle_consistency_backwards":"--","paddle_perf":"0.13521642101054288","paddle_perf_backwards":"0.36057014854586855","paddle_gpu_time":"0.1100323297635886","paddle_gpu_time_backward":"0.33689263033525335"},{"name":"leaky_relu_0","op":"leaky_relu","op_count":3,"config":"x (Variable) - dtype: float32, shape: [16, 512, 31, 31]\nnegative_slope (float): 0.1\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/leaky_relu_0-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/leaky_relu_0-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/leaky_relu_0-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/leaky_relu_0-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/leaky_relu_0-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/leaky_relu_0-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.09324076664016907","paddle_perf_backwards":"0.20965026564387432","paddle_gpu_time":"0.07900791235544735","paddle_gpu_time_backward":"0.19197085239789868"},{"name":"leaky_relu_1","op":"leaky_relu","op_count":3,"config":"x (Variable) - dtype: float16, shape: [16, 512, 31, 31]\nnegative_slope (float): 0.1\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/leaky_relu_1-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/leaky_relu_1-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/leaky_relu_1-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/leaky_relu_1-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/leaky_relu_1-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/leaky_relu_1-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.054547154759786214","paddle_perf_backwards":"0.11477183146649096","paddle_gpu_time":"0.041238075908260605","paddle_gpu_time_backward":"0.09906842818428184"},{"name":"less_equal_0","op":"less_equal","op_count":0,"config":"x (Variable) - dtype: int32, shape: [1]\ny (Variable) - dtype: int32, shape: [1]\ncond (string): None\nforce_cpu (string): None\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/less_equal_0-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/less_equal_0-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/less_equal_0-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/less_equal_0-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/less_equal_0-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/less_equal_0-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"--","paddle_perf":"0.0410406527395","paddle_perf_backwards":"--","paddle_gpu_time":"0.0013217400020046105","paddle_gpu_time_backward":"--"},{"name":"less_equal_1","op":"less_equal","op_count":0,"config":"x (Variable) - dtype: float32, shape: [256, 1024]\ny (Variable) - dtype: float32, shape: [256, 1024]\ncond (string): None\nforce_cpu (string): None\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/less_equal_1-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/less_equal_1-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/less_equal_1-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/less_equal_1-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/less_equal_1-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/less_equal_1-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"--","paddle_perf":"0.020836398214519856","paddle_perf_backwards":"--","paddle_gpu_time":"0.00220711743772242","paddle_gpu_time_backward":"--"},{"name":"less_equal_2","op":"less_equal","op_count":0,"config":"x (Variable) - dtype: int32, shape: [1024]\ny (Variable) - dtype: int32, shape: [256, 1024]\ncond (string): None\nforce_cpu (string): None\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/less_equal_2-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/less_equal_2-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/less_equal_2-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/less_equal_2-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/less_equal_2-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/less_equal_2-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"--","paddle_perf":"0.029476324398675282","paddle_perf_backwards":"--","paddle_gpu_time":"0.0028485305958132045","paddle_gpu_time_backward":"--"},{"name":"less_than_0","op":"less_than","op_count":23,"config":"x (Variable) - dtype: int32, shape: [1]\ny (Variable) - dtype: int32, shape: [1]\ncond (string): None\nforce_cpu (string): None\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/less_than_0-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/less_than_0-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/less_than_0-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/less_than_0-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/less_than_0-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/less_than_0-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"--","paddle_perf":"0.014719098268864386","paddle_perf_backwards":"--","paddle_gpu_time":"0.001259897764859176","paddle_gpu_time_backward":"--"},{"name":"less_than_1","op":"less_than","op_count":23,"config":"x (Variable) - dtype: float32, shape: [256, 1024]\ny (Variable) - dtype: float32, shape: [256, 1024]\ncond (string): None\nforce_cpu (string): None\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/less_than_1-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/less_than_1-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/less_than_1-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/less_than_1-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/less_than_1-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/less_than_1-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"--","paddle_perf":"0.021601559403903018","paddle_perf_backwards":"--","paddle_gpu_time":"0.0025733252254991385","paddle_gpu_time_backward":"--"},{"name":"less_than_2","op":"less_than","op_count":23,"config":"x (Variable) - dtype: int32, shape: [1024]\ny (Variable) - dtype: int32, shape: [256, 1024]\ncond (string): None\nforce_cpu (string): None\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/less_than_2-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/less_than_2-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/less_than_2-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/less_than_2-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/less_than_2-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/less_than_2-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"--","paddle_perf":"0.02500320484260758","paddle_perf_backwards":"--","paddle_gpu_time":"0.0033214321633309864","paddle_gpu_time_backward":"--"},{"name":"lgamma_0","op":"lgamma","op_count":0,"config":"x (Variable) - dtype: float32, shape: [16, 128, 257, 257]\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/lgamma_0-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/lgamma_0-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/lgamma_0-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/lgamma_0-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/lgamma_0-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/lgamma_0-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"1.3457936848810537","paddle_perf_backwards":"4.0218410606613615","paddle_gpu_time":"1.329283771532185","paddle_gpu_time_backward":"4.1104329835968665"},{"name":"lgamma_1","op":"lgamma","op_count":0,"config":"x (Variable) - dtype: float16, shape: [16, 128, 257, 257]\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/lgamma_1-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/lgamma_1-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/lgamma_1-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/lgamma_1-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/lgamma_1-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/lgamma_1-tensorflow_gpu_speed_forward.txt","paddle_consistency":"--","paddle_consistency_backwards":"--","paddle_perf":"--","paddle_perf_backwards":"--","paddle_gpu_time":"--","paddle_gpu_time_backward":"--"},{"name":"linear_0","op":"linear","op_count":0,"config":"bias (Variable) - dtype: float32, shape: [2048]\nweight (Variable) - dtype: float32, shape: [36864, 2048]\nx (Variable) - dtype: float32, shape: [16, 36864]\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/linear_0-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/linear_0-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/linear_0-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/linear_0-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/linear_0-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/linear_0-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.449491277032969","paddle_perf_backwards":"1.321918623788016","paddle_gpu_time":"0.4374772405421809","paddle_gpu_time_backward":"1.2757328072153327"},{"name":"linear_1","op":"linear","op_count":0,"config":"bias (Variable) - dtype: float32, shape: [1024]\nweight (Variable) - dtype: float32, shape: [1024, 1024]\nx (Variable) - dtype: float32, shape: [16, 16, 1024]\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/linear_1-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/linear_1-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/linear_1-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/linear_1-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/linear_1-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/linear_1-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.10060801797983597","paddle_perf_backwards":"0.2169280636067293","paddle_gpu_time":"0.06479671232876713","paddle_gpu_time_backward":"0.17398142414860684"},{"name":"linear_2","op":"linear","op_count":0,"config":"bias (Variable) - dtype: float32, shape: [1024]\nweight (Variable) - dtype: float32, shape: [12544, 1024]\nx (Variable) - dtype: float32, shape: [16, 12544]\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/linear_2-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/linear_2-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/linear_2-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/linear_2-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/linear_2-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/linear_2-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.12507908272020743","paddle_perf_backwards":"0.3091891606648763","paddle_gpu_time":"0.10488331892826273","paddle_gpu_time_backward":"0.2647800504342953"},{"name":"linear_3","op":"linear","op_count":0,"config":"bias (Variable) - dtype: float32, shape: [256]\nweight (Variable) - dtype: float32, shape: [16, 256]\nx (Variable) - dtype: float32, shape: [16, 16]\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/linear_3-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/linear_3-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/linear_3-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/linear_3-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/linear_3-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/linear_3-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.040891949011355025","paddle_perf_backwards":"0.10224288823653241","paddle_gpu_time":"0.004309962497961846","paddle_gpu_time_backward":"0.013321499800558434"},{"name":"linspace_0","op":"linspace","op_count":0,"config":"dtype (string): float32\nnum (int): 5\nstart (float64): -100.0\nstop (float64): 100.0\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/linspace_0-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/linspace_0-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/linspace_0-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/linspace_0-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/linspace_0-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/linspace_0-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"--","paddle_perf":"0.049633882483657535","paddle_perf_backwards":"--","paddle_gpu_time":"0.0012627885691040375","paddle_gpu_time_backward":"--"},{"name":"linspace_1","op":"linspace","op_count":0,"config":"dtype (string): float32\nnum (int): 1000\nstart (float64): 32.0\nstop (float64): 82.0\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/linspace_1-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/linspace_1-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/linspace_1-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/linspace_1-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/linspace_1-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/linspace_1-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"--","paddle_perf":"0.04977863662096919","paddle_perf_backwards":"--","paddle_gpu_time":"0.0013703364847006201","paddle_gpu_time_backward":"--"},{"name":"log_0","op":"log","op_count":0,"config":"x (Variable) - dtype: float32, shape: [16, 128, 257, 257]\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/log_0-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/log_0-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/log_0-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/log_0-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/log_0-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/log_0-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"1.340620025604187","paddle_perf_backwards":"3.2543008456488174","paddle_gpu_time":"1.322904915390814","paddle_gpu_time_backward":"3.229098915989159"},{"name":"log_1","op":"log","op_count":0,"config":"x (Variable) - dtype: float16, shape: [16, 128, 257, 257]\n","timestamp":"2021.0603.112711.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.0603.112711.gcc82.post107.develop/log_1-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.0603.112711.gcc82.post107.develop/log_1-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.0603.112711.gcc82.post107.develop/log_1-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.0603.112711.gcc82.post107.develop/log_1-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.0603.112711.gcc82.post107.develop/log_1-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.0603.112711.gcc82.post107.develop/log_1-tensorflow_gpu_speed_forward.txt","paddle_consistency":"--","paddle_consistency_backwards":"--","paddle_perf":"--","paddle_perf_backwards":"--","paddle_gpu_time":"--","paddle_gpu_time_backward":"--"},{"name":"log_softmax_0","op":"log_softmax","op_count":0,"config":"x (Variable) - dtype: float32, shape: [16, 1000]\naxis (int): -1\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/log_softmax_0-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/log_softmax_0-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/log_softmax_0-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/log_softmax_0-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/log_softmax_0-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/log_softmax_0-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.019272492856395488","paddle_perf_backwards":"0.028342130232830436","paddle_gpu_time":"0.0057528169014084505","paddle_gpu_time_backward":"0.00923935052531041"},{"name":"log_softmax_1","op":"log_softmax","op_count":0,"config":"x (Variable) - dtype: float16, shape: [16, 1000]\naxis (int): -1\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/log_softmax_1-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/log_softmax_1-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/log_softmax_1-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/log_softmax_1-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/log_softmax_1-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/log_softmax_1-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"False","paddle_perf":"0.0211384831642618","paddle_perf_backwards":"0.02587887705588827","paddle_gpu_time":"0.007826928865255343","paddle_gpu_time_backward":"0.012914138902701808"},{"name":"log_softmax_2","op":"log_softmax","op_count":0,"config":"x (Variable) - dtype: float32, shape: [32, 12, 128, 128]\naxis (int): -1\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/log_softmax_2-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/log_softmax_2-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/log_softmax_2-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/log_softmax_2-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/log_softmax_2-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/log_softmax_2-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.0784462235538835","paddle_perf_backwards":"0.17099332617947374","paddle_gpu_time":"0.06363765469669304","paddle_gpu_time_backward":"0.15468924640135479"},{"name":"log_softmax_3","op":"log_softmax","op_count":0,"config":"x (Variable) - dtype: float16, shape: [32, 12, 128, 128]\naxis (int): -1\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/log_softmax_3-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/log_softmax_3-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/log_softmax_3-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/log_softmax_3-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/log_softmax_3-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/log_softmax_3-tensorflow_gpu_speed_forward.txt","paddle_consistency":"False","paddle_consistency_backwards":"False","paddle_perf":"0.04895312719076991","paddle_perf_backwards":"0.09749850116101613","paddle_gpu_time":"0.03363663998378762","paddle_gpu_time_backward":"0.08140840278959006"},{"name":"log_softmax_4","op":"log_softmax","op_count":0,"config":"x (Variable) - dtype: float32, shape: [15, 16, 33, 33]\naxis (int): -1\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/log_softmax_4-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/log_softmax_4-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/log_softmax_4-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/log_softmax_4-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/log_softmax_4-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/log_softmax_4-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.019455601074891478","paddle_perf_backwards":"0.029212105011414434","paddle_gpu_time":"0.005085014553849242","paddle_gpu_time_backward":"0.009998027613412229"},{"name":"log_softmax_5","op":"log_softmax","op_count":0,"config":"x (Variable) - dtype: float16, shape: [15, 16, 33, 33]\naxis (int): -1\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/log_softmax_5-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/log_softmax_5-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/log_softmax_5-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/log_softmax_5-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/log_softmax_5-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/log_softmax_5-tensorflow_gpu_speed_forward.txt","paddle_consistency":"False","paddle_consistency_backwards":"False","paddle_perf":"0.01930910981967597","paddle_perf_backwards":"0.02980210738096065","paddle_gpu_time":"0.004378771173699509","paddle_gpu_time_backward":"0.01007200791295747"},{"name":"log_softmax_6","op":"log_softmax","op_count":0,"config":"x (Variable) - dtype: float32, shape: [128, 128, 16, 16]\naxis (int): 0\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/log_softmax_6-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/log_softmax_6-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/log_softmax_6-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/log_softmax_6-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/log_softmax_6-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/log_softmax_6-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.21445142979524573","paddle_perf_backwards":"1.1767450644045458","paddle_gpu_time":"0.2026585489599188","paddle_gpu_time_backward":"1.187977420129714"},{"name":"log_softmax_7","op":"log_softmax","op_count":0,"config":"x (Variable) - dtype: float16, shape: [128, 128, 16, 16]\naxis (int): 0\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/log_softmax_7-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/log_softmax_7-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/log_softmax_7-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/log_softmax_7-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/log_softmax_7-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/log_softmax_7-tensorflow_gpu_speed_forward.txt","paddle_consistency":"False","paddle_consistency_backwards":"False","paddle_perf":"0.20630529948643275","paddle_perf_backwards":"0.7982173744513064","paddle_gpu_time":"0.18132311415893507","paddle_gpu_time_backward":"0.7899249870667356"},{"name":"log_softmax_8","op":"log_softmax","op_count":0,"config":"x (Variable) - dtype: float32, shape: [512, 896, 4, 12]\naxis (int): 1\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/log_softmax_8-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/log_softmax_8-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/log_softmax_8-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/log_softmax_8-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/log_softmax_8-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/log_softmax_8-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.7913399715812839","paddle_perf_backwards":"14.760586193629672","paddle_gpu_time":"0.7667415042351261","paddle_gpu_time_backward":"14.744752663221178"},{"name":"log_softmax_9","op":"log_softmax","op_count":0,"config":"x (Variable) - dtype: float16, shape: [512, 896, 4, 12]\naxis (int): 1\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/log_softmax_9-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/log_softmax_9-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/log_softmax_9-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/log_softmax_9-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/log_softmax_9-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/log_softmax_9-tensorflow_gpu_speed_forward.txt","paddle_consistency":"False","paddle_consistency_backwards":"False","paddle_perf":"0.624443803514753","paddle_perf_backwards":"14.446062457804777","paddle_gpu_time":"0.632985724410246","paddle_gpu_time_backward":"14.357635262168374"},{"name":"logical_and_0","op":"logical_and","op_count":4,"config":"x (Variable) - dtype: bool, shape: [16, 1785]\ny (Variable) - dtype: bool, shape: [16, 1785]\nout (string): None\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/logical_and_0-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/logical_and_0-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/logical_and_0-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/logical_and_0-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/logical_and_0-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/logical_and_0-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"--","paddle_perf":"0.021037763478804605","paddle_perf_backwards":"--","paddle_gpu_time":"0.0014450465187608628","paddle_gpu_time_backward":"--"},{"name":"logical_and_1","op":"logical_and","op_count":4,"config":"x (Variable) - dtype: bool, shape: [1]\ny (Variable) - dtype: bool, shape: [1]\nout (string): None\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/logical_and_1-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/logical_and_1-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/logical_and_1-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/logical_and_1-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/logical_and_1-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/logical_and_1-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"--","paddle_perf":"0.021480008929908152","paddle_perf_backwards":"--","paddle_gpu_time":"0.001342951608055305","paddle_gpu_time_backward":"--"},{"name":"logical_not_0","op":"logical_not","op_count":2,"config":"x (Variable) - dtype: bool, shape: [1]\nout (string): None\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/logical_not_0-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/logical_not_0-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/logical_not_0-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/logical_not_0-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/logical_not_0-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/logical_not_0-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"--","paddle_perf":"0.020251225452033842","paddle_perf_backwards":"--","paddle_gpu_time":"0.0013275704261161395","paddle_gpu_time_backward":"--"},{"name":"logical_or_0","op":"logical_or","op_count":0,"config":"x (Variable) - dtype: bool, shape: [16, 1785]\ny (Variable) - dtype: bool, shape: [16, 1785]\nout (string): None\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/logical_or_0-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/logical_or_0-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/logical_or_0-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/logical_or_0-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/logical_or_0-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/logical_or_0-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"--","paddle_perf":"0.02388662221480389","paddle_perf_backwards":"--","paddle_gpu_time":"0.0014454219948849105","paddle_gpu_time_backward":"--"},{"name":"logical_or_1","op":"logical_or","op_count":0,"config":"x (Variable) - dtype: bool, shape: [1]\ny (Variable) - dtype: bool, shape: [1]\nout (string): None\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/logical_or_1-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/logical_or_1-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/logical_or_1-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/logical_or_1-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/logical_or_1-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/logical_or_1-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"--","paddle_perf":"0.024704799384535677","paddle_perf_backwards":"--","paddle_gpu_time":"0.0012268764405251027","paddle_gpu_time_backward":"--"},{"name":"logsumexp_0","op":"logsumexp","op_count":0,"config":"x (Variable) - dtype: float32, shape: [64, 64]\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/logsumexp_0-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/logsumexp_0-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/logsumexp_0-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/logsumexp_0-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/logsumexp_0-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/logsumexp_0-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.047530933302275986","paddle_perf_backwards":"0.05550408849910815","paddle_gpu_time":"0.026229007633587785","paddle_gpu_time_backward":"0.03236606116348586"},{"name":"logsumexp_1","op":"logsumexp","op_count":0,"config":"x (Variable) - dtype: float32, shape: [1024, 512]\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/logsumexp_1-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/logsumexp_1-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/logsumexp_1-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/logsumexp_1-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/logsumexp_1-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/logsumexp_1-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.09582845532164282","paddle_perf_backwards":"0.10422589827557","paddle_gpu_time":"0.07801795173631548","paddle_gpu_time_backward":"0.08362762099952704"},{"name":"lstm_0","op":"lstm","op_count":1,"config":"inital_states (Variable) - dtype: float32, shape: [2, 20, 200]\ninputs (Variable) - dtype: float32, shape: [20, 20, 200]\ndirection (string): forward\nhidden_size (int): 200\nnum_layers (int): 2\nsequence_length (int): 20\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/lstm_0-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/lstm_0-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/lstm_0-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/lstm_0-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/lstm_0-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/lstm_0-tensorflow_gpu_speed_forward.txt","paddle_consistency":"--","paddle_consistency_backwards":"--","paddle_perf":"37.90393362239916","paddle_perf_backwards":"--","paddle_gpu_time":"2.858360491351216","paddle_gpu_time_backward":"--"},{"name":"masked_select_0","op":"masked_select","op_count":0,"config":"mask (Variable) - dtype: bool, shape: [524288]\nx (Variable) - dtype: int32, shape: [524288]\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/masked_select_0-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/masked_select_0-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/masked_select_0-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/masked_select_0-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/masked_select_0-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/masked_select_0-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"--","paddle_perf":"0.8505439271732252","paddle_perf_backwards":"4.249060883813975","paddle_gpu_time":"0.03860883720930233","paddle_gpu_time_backward":"0.0714328743545611"},{"name":"masked_select_1","op":"masked_select","op_count":0,"config":"mask (Variable) - dtype: bool, shape: [524288, 2]\nx (Variable) - dtype: float32, shape: [524288, 2]\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/masked_select_1-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/masked_select_1-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/masked_select_1-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/masked_select_1-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/masked_select_1-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/masked_select_1-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.9208954002962534","paddle_perf_backwards":"1.3770919248282192","paddle_gpu_time":"0.06021037253469686","paddle_gpu_time_backward":"0.11727630838491841"},{"name":"matmul_0","op":"matmul","op_count":126,"config":"x (Variable) - dtype: float32, shape: [16, 128, 8]\ny (Variable) - dtype: float32, shape: [16, 8, 32]\ntranspose_x (bool): False\ntranspose_y (bool): False\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/matmul_0-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/matmul_0-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/matmul_0-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/matmul_0-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/matmul_0-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/matmul_0-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.02391192377830038","paddle_perf_backwards":"0.05573910109850825","paddle_gpu_time":"0.005289309442319308","paddle_gpu_time_backward":"0.02840518816222141"},{"name":"matmul_1","op":"matmul","op_count":126,"config":"x (Variable) - dtype: float32, shape: [16, 35, 1500]\ny (Variable) - dtype: float32, shape: [1500, 10000]\ntranspose_x (bool): False\ntranspose_y (bool): False\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/matmul_1-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/matmul_1-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/matmul_1-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/matmul_1-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/matmul_1-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/matmul_1-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"1.4757951911614864","paddle_perf_backwards":"4.188800101377526","paddle_gpu_time":"1.448476286579213","paddle_gpu_time_backward":"4.178450704225352"},{"name":"matmul_2","op":"matmul","op_count":126,"config":"x (Variable) - dtype: float32, shape: [16, 3000]\ny (Variable) - dtype: float32, shape: [3000, 6000]\ntranspose_x (bool): False\ntranspose_y (bool): False\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/matmul_2-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/matmul_2-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/matmul_2-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/matmul_2-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/matmul_2-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/matmul_2-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.13569729600497382","paddle_perf_backwards":"0.36570089374611037","paddle_gpu_time":"0.11201809396525134","paddle_gpu_time_backward":"0.3400709040263358"},{"name":"matmul_3","op":"matmul","op_count":126,"config":"x (Variable) - dtype: float32, shape: [16, 1, 512]\ny (Variable) - dtype: float32, shape: [16, 16, 512]\ntranspose_x (bool): False\ntranspose_y (bool): True\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/matmul_3-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/matmul_3-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/matmul_3-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/matmul_3-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/matmul_3-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/matmul_3-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.02594379463581124","paddle_perf_backwards":"0.04593158009076359","paddle_gpu_time":"0.003687982527427875","paddle_gpu_time_backward":"0.012859339758047978"},{"name":"matmul_4","op":"matmul","op_count":126,"config":"x (Variable) - dtype: float32, shape: [16, 1024]\ny (Variable) - dtype: float32, shape: [37007, 1024]\ntranspose_x (bool): False\ntranspose_y (bool): True\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/matmul_4-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/matmul_4-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/matmul_4-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/matmul_4-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/matmul_4-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/matmul_4-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.26045815500324376","paddle_perf_backwards":"0.6984235528475774","paddle_gpu_time":"0.23884417029022148","paddle_gpu_time_backward":"0.6687573303546496"},{"name":"matmul_5","op":"matmul","op_count":126,"config":"x (Variable) - dtype: float32, shape: [512, 4, 896, 2]\ny (Variable) - dtype: float32, shape: [512, 4, 12, 2]\ntranspose_x (bool): False\ntranspose_y (bool): True\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/matmul_5-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/matmul_5-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/matmul_5-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/matmul_5-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/matmul_5-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/matmul_5-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.6028146159892179","paddle_perf_backwards":"3.433480554697465","paddle_gpu_time":"0.587262682069312","paddle_gpu_time_backward":"3.430220514533912"},{"name":"matmul_6","op":"matmul","op_count":126,"config":"x (Variable) - dtype: float16, shape: [512, 4, 896, 2]\ny (Variable) - dtype: float16, shape: [512, 4, 12, 2]\ntranspose_x (bool): False\ntranspose_y (bool): True\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/matmul_6-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/matmul_6-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/matmul_6-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/matmul_6-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/matmul_6-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/matmul_6-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.415692037465621","paddle_perf_backwards":"2.0259876640475527","paddle_gpu_time":"0.39826392373306574","paddle_gpu_time_backward":"1.9641432791728215"},{"name":"matmul_7","op":"matmul","op_count":126,"config":"x (Variable) - dtype: float16, shape: [512, 4, 896, 8]\ny (Variable) - dtype: float16, shape: [512, 4, 16, 8]\ntranspose_x (bool): False\ntranspose_y (bool): True\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/matmul_7-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/matmul_7-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/matmul_7-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/matmul_7-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/matmul_7-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/matmul_7-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.26750296962504483","paddle_perf_backwards":"0.7261492768112494","paddle_gpu_time":"0.27538743882544864","paddle_gpu_time_backward":"0.7251646149441741"},{"name":"matmul_8","op":"matmul","op_count":126,"config":"x (Variable) - dtype: float32, shape: [4, 12, 64, 85]\ny (Variable) - dtype: float32, shape: [4, 12, 85, 512]\ntranspose_x (bool): False\ntranspose_y (bool): False\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/matmul_8-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/matmul_8-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/matmul_8-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/matmul_8-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/matmul_8-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/matmul_8-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.05597873609893176","paddle_perf_backwards":"0.16441369543270187","paddle_gpu_time":"0.03829849498327759","paddle_gpu_time_backward":"0.14591898297780653"},{"name":"matmul_9","op":"matmul","op_count":126,"config":"x (Variable) - dtype: float16, shape: [4, 12, 64, 85]\ny (Variable) - dtype: float16, shape: [4, 12, 85, 512]\ntranspose_x (bool): False\ntranspose_y (bool): False\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/matmul_9-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/matmul_9-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/matmul_9-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/matmul_9-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/matmul_9-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/matmul_9-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.04513069074981067","paddle_perf_backwards":"0.13665617728719906","paddle_gpu_time":"0.023698458975426905","paddle_gpu_time_backward":"0.1255202156334232"},{"name":"matmul_10","op":"matmul","op_count":126,"config":"x (Variable) - dtype: float16, shape: [4, 12, 64, 88]\ny (Variable) - dtype: float16, shape: [4, 12, 88, 512]\ntranspose_x (bool): False\ntranspose_y (bool): False\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/matmul_10-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/matmul_10-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/matmul_10-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/matmul_10-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/matmul_10-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/matmul_10-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.03124280851714465","paddle_perf_backwards":"0.07538162932103994","paddle_gpu_time":"0.013614432109308282","paddle_gpu_time_backward":"0.05356629966491144"},{"name":"max_0","op":"max","op_count":0,"config":"x (Variable) - dtype: float32, shape: [16, 2048, 33, 33]\naxis (list): [2, 3]\nkeepdim (bool): True\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/max_0-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/max_0-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/max_0-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/max_0-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/max_0-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/max_0-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.1996272312615343","paddle_perf_backwards":"1.6810679722405626","paddle_gpu_time":"0.1831203731873035","paddle_gpu_time_backward":"1.6361644454483848"},{"name":"max_1","op":"max","op_count":0,"config":"x (Variable) - dtype: float32, shape: [16, 8, 128]\naxis (list): [1]\nkeepdim (bool): False\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/max_1-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/max_1-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/max_1-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/max_1-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/max_1-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/max_1-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.02513029137436225","paddle_perf_backwards":"0.03712907129404496","paddle_gpu_time":"0.0015795060216370687","paddle_gpu_time_backward":"0.005473691384950927"},{"name":"max_2","op":"max","op_count":0,"config":"x (Variable) - dtype: float32, shape: [16, 16, 1, 1]\naxis (list): [0]\nkeepdim (bool): False\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/max_2-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/max_2-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/max_2-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/max_2-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/max_2-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/max_2-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.024892359363789456","paddle_perf_backwards":"0.03859777839816346","paddle_gpu_time":"0.0016271564846762733","paddle_gpu_time_backward":"0.0060422764227642265"},{"name":"max_3","op":"max","op_count":0,"config":"x (Variable) - dtype: float32, shape: [30522, 1024]\naxis (string): None\nkeepdim (bool): False\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/max_3-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/max_3-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/max_3-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/max_3-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/max_3-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/max_3-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"False","paddle_perf":"0.1707858934192237","paddle_perf_backwards":"0.5447126342681701","paddle_gpu_time":"0.1477621364248007","paddle_gpu_time_backward":"0.5022187050359712"},{"name":"max_pool2d_0","op":"max_pool2d","op_count":0,"config":"x (Variable) - dtype: float32, shape: [16, 64, 112, 112]\nceil_mode (bool): False\ndata_format (string): NCHW\nkernel_size (list): [3, 3]\npadding (list): [1, 1]\nreturn_indices (bool): False\nstride (list): [2, 2]\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/max_pool2d_0-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/max_pool2d_0-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/max_pool2d_0-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/max_pool2d_0-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/max_pool2d_0-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/max_pool2d_0-tensorflow_gpu_speed_forward.txt","paddle_consistency":"--","paddle_consistency_backwards":"--","paddle_perf":"0.1351168835498243","paddle_perf_backwards":"0.466054271023915","paddle_gpu_time":"0.115032002438281","paddle_gpu_time_backward":"0.43973380035026255"},{"name":"maximum_0","op":"maximum","op_count":0,"config":"x (Variable) - dtype: float32, shape: [50, 128, 1000]\ny (Variable) - dtype: float32, shape: [128, 1000]\naxis (int): -1\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/maximum_0-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/maximum_0-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/maximum_0-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/maximum_0-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/maximum_0-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/maximum_0-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.08477045370413137","paddle_perf_backwards":"0.2148625967619536","paddle_gpu_time":"0.06434112806101967","paddle_gpu_time_backward":"0.19760766260766258"},{"name":"maximum_1","op":"maximum","op_count":0,"config":"x (Variable) - dtype: float32, shape: [50, 128, 1000]\ny (Variable) - dtype: float32, shape: [1, 128, 1000]\naxis (int): 0\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/maximum_1-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/maximum_1-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/maximum_1-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/maximum_1-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/maximum_1-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/maximum_1-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.0842081759879011","paddle_perf_backwards":"0.216521385437501","paddle_gpu_time":"0.06455680902497984","paddle_gpu_time_backward":"0.1983405012605665"},{"name":"maximum_2","op":"maximum","op_count":0,"config":"x (Variable) - dtype: float32, shape: [16, 2048, 7, 7]\ny (Variable) - dtype: float32, shape: [16, 2048]\naxis (int): 0\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/maximum_2-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/maximum_2-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/maximum_2-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/maximum_2-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/maximum_2-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/maximum_2-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.03605414488033684","paddle_perf_backwards":"0.0872597665729408","paddle_gpu_time":"0.017323262839879155","paddle_gpu_time_backward":"0.06688649500788228"},{"name":"maximum_3","op":"maximum","op_count":0,"config":"x (Variable) - dtype: float32, shape: [16, 2048, 16, 16]\ny (Variable) - dtype: float32, shape: [16, 2048, 16, 16]\naxis (int): -1\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/maximum_3-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/maximum_3-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/maximum_3-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/maximum_3-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/maximum_3-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/maximum_3-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.13847967426858113","paddle_perf_backwards":"0.3543486337145727","paddle_gpu_time":"0.12139682860317139","paddle_gpu_time_backward":"0.3307553729456384"},{"name":"maximum_4","op":"maximum","op_count":0,"config":"x (Variable) - dtype: float32, shape: [16, 1, 513, 513]\ny (Variable) - dtype: float32, shape: [1]\naxis (int): -1\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/maximum_4-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/maximum_4-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/maximum_4-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/maximum_4-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/maximum_4-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/maximum_4-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.06175253816501411","paddle_perf_backwards":"2.0458938602455152","paddle_gpu_time":"0.042671701913393756","paddle_gpu_time_backward":"1.9967626633986926"},{"name":"maximum_5","op":"maximum","op_count":0,"config":"x (Variable) - dtype: float32, shape: [512, 896, 4, 12]\ny (Variable) - dtype: float32, shape: [512, 896, 4, 1]\naxis (int): -1\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/maximum_5-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/maximum_5-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/maximum_5-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/maximum_5-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/maximum_5-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/maximum_5-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.24409814922508594","paddle_perf_backwards":"2.8593028236725524","paddle_gpu_time":"0.22475163727959693","paddle_gpu_time_backward":"2.818225263386554"},{"name":"maximum_6","op":"maximum","op_count":0,"config":"x (Variable) - dtype: float16, shape: [512, 896, 4, 12]\ny (Variable) - dtype: float16, shape: [512, 896, 4, 1]\naxis (int): -1\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/maximum_6-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/maximum_6-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/maximum_6-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/maximum_6-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/maximum_6-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/maximum_6-tensorflow_gpu_speed_forward.txt","paddle_consistency":"--","paddle_consistency_backwards":"--","paddle_perf":"--","paddle_perf_backwards":"--","paddle_gpu_time":"--","paddle_gpu_time_backward":"--"},{"name":"maximum_7","op":"maximum","op_count":0,"config":"x (Variable) - dtype: float16, shape: [32, 12, 128, 128]\ny (Variable) - dtype: float16, shape: [32, 1, 1, 128]\naxis (int): -1\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/maximum_7-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/maximum_7-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/maximum_7-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/maximum_7-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/maximum_7-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/maximum_7-tensorflow_gpu_speed_forward.txt","paddle_consistency":"--","paddle_consistency_backwards":"--","paddle_perf":"--","paddle_perf_backwards":"--","paddle_gpu_time":"--","paddle_gpu_time_backward":"--"},{"name":"maximum_8","op":"maximum","op_count":0,"config":"x (Variable) - dtype: float16, shape: [32, 1, 1, 128]\ny (Variable) - dtype: float16, shape: [1, 12, 128, 1]\naxis (int): -1\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/maximum_8-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/maximum_8-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/maximum_8-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/maximum_8-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/maximum_8-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/maximum_8-tensorflow_gpu_speed_forward.txt","paddle_consistency":"--","paddle_consistency_backwards":"--","paddle_perf":"--","paddle_perf_backwards":"--","paddle_gpu_time":"--","paddle_gpu_time_backward":"--"},{"name":"mean_0","op":"mean","op_count":89,"config":"x (Variable) - dtype: float32, shape: [16, 2048, 33, 33]\naxis (list): [2, 3]\nkeepdim (bool): True\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/mean_0-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/mean_0-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/mean_0-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/mean_0-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/mean_0-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/mean_0-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.19941306066417502","paddle_perf_backwards":"0.8797586083651067","paddle_gpu_time":"0.17356331168831166","paddle_gpu_time_backward":"0.8514483020780537"},{"name":"mean_1","op":"mean","op_count":89,"config":"x (Variable) - dtype: float32, shape: [16, 8, 128]\naxis (list): [1]\nkeepdim (bool): False\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/mean_1-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/mean_1-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/mean_1-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/mean_1-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/mean_1-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/mean_1-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.02769718364793427","paddle_perf_backwards":"0.03843818392072405","paddle_gpu_time":"0.0014884505314799671","paddle_gpu_time_backward":"0.003995257763500076"},{"name":"mean_2","op":"mean","op_count":89,"config":"x (Variable) - dtype: float32, shape: [16, 16, 1, 1]\naxis (list): [0]\nkeepdim (bool): False\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/mean_2-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/mean_2-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/mean_2-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/mean_2-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/mean_2-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/mean_2-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.024625476525754343","paddle_perf_backwards":"0.03720108343630421","paddle_gpu_time":"0.0018345498783454988","paddle_gpu_time_backward":"0.0038994013967409377"},{"name":"mean_3","op":"mean","op_count":89,"config":"x (Variable) - dtype: float32, shape: [30522, 1024]\naxis (string): None\nkeepdim (bool): False\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/mean_3-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/mean_3-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/mean_3-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/mean_3-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/mean_3-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/mean_3-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.17038841763574755","paddle_perf_backwards":"0.33359004404836284","paddle_gpu_time":"0.14776168804302853","paddle_gpu_time_backward":"0.30683249749247743"},{"name":"min_0","op":"min","op_count":0,"config":"x (Variable) - dtype: float32, shape: [16, 2048, 33, 33]\naxis (list): [2, 3]\nkeepdim (bool): True\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/min_0-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/min_0-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/min_0-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/min_0-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/min_0-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/min_0-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.2019447649648051","paddle_perf_backwards":"1.6795070950158375","paddle_gpu_time":"0.18284729263841004","paddle_gpu_time_backward":"1.638775971093044"},{"name":"min_1","op":"min","op_count":0,"config":"x (Variable) - dtype: float32, shape: [16, 8, 128]\naxis (list): [1]\nkeepdim (bool): False\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/min_1-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/min_1-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/min_1-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/min_1-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/min_1-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/min_1-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.025422475775893856","paddle_perf_backwards":"0.037416876578817565","paddle_gpu_time":"0.00158078022875817","paddle_gpu_time_backward":"0.006353684776761698"},{"name":"min_2","op":"min","op_count":0,"config":"x (Variable) - dtype: float32, shape: [16, 16, 1, 1]\naxis (list): [0]\nkeepdim (bool): False\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/min_2-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/min_2-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/min_2-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/min_2-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/min_2-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/min_2-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.028192510410230985","paddle_perf_backwards":"0.03805306493019571","paddle_gpu_time":"0.0016269156602050138","paddle_gpu_time_backward":"0.005136037114636337"},{"name":"min_3","op":"min","op_count":0,"config":"x (Variable) - dtype: float32, shape: [30522, 1024]\naxis (string): None\nkeepdim (bool): False\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/min_3-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/min_3-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/min_3-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/min_3-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/min_3-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/min_3-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.17143982445787573","paddle_perf_backwards":"0.5299362247597001","paddle_gpu_time":"0.14782977842203354","paddle_gpu_time_backward":"0.5029080972242197"},{"name":"minimum_0","op":"minimum","op_count":0,"config":"x (Variable) - dtype: float32, shape: [50, 128, 1000]\ny (Variable) - dtype: float32, shape: [128, 1000]\naxis (int): -1\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/minimum_0-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/minimum_0-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/minimum_0-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/minimum_0-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/minimum_0-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/minimum_0-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.0823591803167914","paddle_perf_backwards":"0.22396069985848885","paddle_gpu_time":"0.06434012444801285","paddle_gpu_time_backward":"0.19712989138521056"},{"name":"minimum_1","op":"minimum","op_count":0,"config":"x (Variable) - dtype: float32, shape: [50, 128, 1000]\ny (Variable) - dtype: float32, shape: [1, 128, 1000]\naxis (int): 0\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/minimum_1-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/minimum_1-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/minimum_1-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/minimum_1-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/minimum_1-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/minimum_1-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.08796587257920381","paddle_perf_backwards":"0.2209703764600123","paddle_gpu_time":"0.06456679427765465","paddle_gpu_time_backward":"0.19724758364312267"},{"name":"minimum_2","op":"minimum","op_count":0,"config":"x (Variable) - dtype: float32, shape: [16, 2048, 7, 7]\ny (Variable) - dtype: float32, shape: [16, 2048]\naxis (int): 0\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/minimum_2-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/minimum_2-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/minimum_2-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/minimum_2-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/minimum_2-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/minimum_2-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.04870748233221815","paddle_perf_backwards":"0.09130344601097948","paddle_gpu_time":"0.01732749068197844","paddle_gpu_time_backward":"0.06639831623257038"},{"name":"minimum_3","op":"minimum","op_count":0,"config":"x (Variable) - dtype: float32, shape: [16, 2048, 16, 16]\ny (Variable) - dtype: float32, shape: [16, 2048, 16, 16]\naxis (int): -1\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/minimum_3-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/minimum_3-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/minimum_3-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/minimum_3-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/minimum_3-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/minimum_3-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.13916365847081125","paddle_perf_backwards":"0.3519819829172505","paddle_gpu_time":"0.12137850939204199","paddle_gpu_time_backward":"0.32924904942965777"},{"name":"minimum_4","op":"minimum","op_count":0,"config":"x (Variable) - dtype: float32, shape: [16, 1, 513, 513]\ny (Variable) - dtype: float32, shape: [1]\naxis (int): -1\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/minimum_4-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/minimum_4-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/minimum_4-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/minimum_4-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/minimum_4-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/minimum_4-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.06165571346550523","paddle_perf_backwards":"1.9735966034547123","paddle_gpu_time":"0.04274889157597743","paddle_gpu_time_backward":"1.941657300500664"},{"name":"minimum_5","op":"minimum","op_count":0,"config":"x (Variable) - dtype: float32, shape: [512, 896, 4, 12]\ny (Variable) - dtype: float32, shape: [512, 896, 4, 1]\naxis (int): -1\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/minimum_5-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/minimum_5-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/minimum_5-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/minimum_5-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/minimum_5-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/minimum_5-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.24310541535188293","paddle_perf_backwards":"2.8595978129125075","paddle_gpu_time":"0.22472899455974207","paddle_gpu_time_backward":"2.816858570497501"},{"name":"minimum_6","op":"minimum","op_count":0,"config":"x (Variable) - dtype: float16, shape: [512, 896, 4, 12]\ny (Variable) - dtype: float16, shape: [512, 896, 4, 1]\naxis (int): -1\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/minimum_6-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/minimum_6-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/minimum_6-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/minimum_6-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/minimum_6-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/minimum_6-tensorflow_gpu_speed_forward.txt","paddle_consistency":"--","paddle_consistency_backwards":"--","paddle_perf":"--","paddle_perf_backwards":"--","paddle_gpu_time":"--","paddle_gpu_time_backward":"--"},{"name":"minimum_7","op":"minimum","op_count":0,"config":"x (Variable) - dtype: float16, shape: [32, 12, 128, 128]\ny (Variable) - dtype: float16, shape: [32, 1, 1, 128]\naxis (int): -1\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/minimum_7-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/minimum_7-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/minimum_7-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/minimum_7-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/minimum_7-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/minimum_7-tensorflow_gpu_speed_forward.txt","paddle_consistency":"--","paddle_consistency_backwards":"--","paddle_perf":"--","paddle_perf_backwards":"--","paddle_gpu_time":"--","paddle_gpu_time_backward":"--"},{"name":"minimum_8","op":"minimum","op_count":0,"config":"x (Variable) - dtype: float16, shape: [32, 1, 1, 128]\ny (Variable) - dtype: float16, shape: [1, 12, 128, 1]\naxis (int): -1\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/minimum_8-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/minimum_8-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/minimum_8-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/minimum_8-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/minimum_8-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/minimum_8-tensorflow_gpu_speed_forward.txt","paddle_consistency":"--","paddle_consistency_backwards":"--","paddle_perf":"--","paddle_perf_backwards":"--","paddle_gpu_time":"--","paddle_gpu_time_backward":"--"},{"name":"multiply_0","op":"multiply","op_count":0,"config":"x (Variable) - dtype: float32, shape: [50, 128, 1000]\ny (Variable) - dtype: float32, shape: [128, 1000]\naxis (int): -1\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/multiply_0-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/multiply_0-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/multiply_0-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/multiply_0-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/multiply_0-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/multiply_0-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.08440452056365448","paddle_perf_backwards":"0.23651508478311684","paddle_gpu_time":"0.06434256749974906","paddle_gpu_time_backward":"0.21253546353345745"},{"name":"multiply_1","op":"multiply","op_count":0,"config":"x (Variable) - dtype: float32, shape: [50, 128, 1000]\ny (Variable) - dtype: float32, shape: [1, 128, 1000]\naxis (int): 0\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/multiply_1-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/multiply_1-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/multiply_1-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/multiply_1-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/multiply_1-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/multiply_1-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.08386289906167316","paddle_perf_backwards":"0.23416803929514304","paddle_gpu_time":"0.06465491183879093","paddle_gpu_time_backward":"0.21090241796200343"},{"name":"multiply_2","op":"multiply","op_count":0,"config":"x (Variable) - dtype: float32, shape: [16, 2048, 7, 7]\ny (Variable) - dtype: float32, shape: [16, 2048]\naxis (int): 0\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/multiply_2-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/multiply_2-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/multiply_2-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/multiply_2-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/multiply_2-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/multiply_2-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.03712829463706466","paddle_perf_backwards":"0.09263433769852938","paddle_gpu_time":"0.017599919460384576","paddle_gpu_time_backward":"0.0671187106918239"},{"name":"multiply_3","op":"multiply","op_count":0,"config":"x (Variable) - dtype: float32, shape: [16, 2048, 16, 16]\ny (Variable) - dtype: float32, shape: [16, 2048, 16, 16]\naxis (int): -1\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/multiply_3-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/multiply_3-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/multiply_3-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/multiply_3-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/multiply_3-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/multiply_3-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.13811268166215243","paddle_perf_backwards":"0.3538968090065018","paddle_gpu_time":"0.12137749949505153","paddle_gpu_time_backward":"0.32916270218839194"},{"name":"multiply_4","op":"multiply","op_count":0,"config":"x (Variable) - dtype: float32, shape: [16, 1, 513, 513]\ny (Variable) - dtype: float32, shape: [1]\naxis (int): -1\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/multiply_4-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/multiply_4-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/multiply_4-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/multiply_4-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/multiply_4-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/multiply_4-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.06350551196234021","paddle_perf_backwards":"2.1210637503492094","paddle_gpu_time":"0.04271555197421434","paddle_gpu_time_backward":"2.0728673469387755"},{"name":"multiply_5","op":"multiply","op_count":0,"config":"x (Variable) - dtype: float32, shape: [512, 896, 4, 12]\ny (Variable) - dtype: float32, shape: [512, 896, 4, 1]\naxis (int): -1\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/multiply_5-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/multiply_5-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/multiply_5-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/multiply_5-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/multiply_5-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/multiply_5-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.24462634910323577","paddle_perf_backwards":"2.8581989074278926","paddle_gpu_time":"0.22480854494155583","paddle_gpu_time_backward":"2.8198957428323195"},{"name":"multiply_6","op":"multiply","op_count":0,"config":"x (Variable) - dtype: float16, shape: [512, 896, 4, 12]\ny (Variable) - dtype: float16, shape: [512, 896, 4, 1]\naxis (int): -1\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/multiply_6-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/multiply_6-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/multiply_6-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/multiply_6-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/multiply_6-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/multiply_6-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.137971491220007","paddle_perf_backwards":"2.7506839798157476","paddle_gpu_time":"0.12108020304568529","paddle_gpu_time_backward":"2.7189833699403825"},{"name":"multiply_7","op":"multiply","op_count":0,"config":"x (Variable) - dtype: float16, shape: [32, 12, 128, 128]\ny (Variable) - dtype: float16, shape: [32, 1, 1, 128]\naxis (int): -1\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/multiply_7-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/multiply_7-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/multiply_7-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/multiply_7-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/multiply_7-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/multiply_7-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.05289493315669906","paddle_perf_backwards":"0.6155840364325955","paddle_gpu_time":"0.03507854464376203","paddle_gpu_time_backward":"0.6160071904409433"},{"name":"multiply_8","op":"multiply","op_count":0,"config":"x (Variable) - dtype: float16, shape: [32, 1, 1, 128]\ny (Variable) - dtype: float16, shape: [1, 12, 128, 1]\naxis (int): -1\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/multiply_8-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/multiply_8-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/multiply_8-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/multiply_8-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/multiply_8-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/multiply_8-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"False","paddle_perf":"0.0582324001258743","paddle_perf_backwards":"0.49774570312194205","paddle_gpu_time":"0.039291929192919295","paddle_gpu_time_backward":"0.4732974683544304"},{"name":"mv_0","op":"mv","op_count":0,"config":"vec (Variable) - dtype: float32, shape: [8]\nx (Variable) - dtype: float32, shape: [128, 8]\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/mv_0-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/mv_0-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/mv_0-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/mv_0-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/mv_0-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/mv_0-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.01991476331438337","paddle_perf_backwards":"0.038805543159951966","paddle_gpu_time":"0.0017147205031956985","paddle_gpu_time_backward":"0.005088160569105691"},{"name":"mv_1","op":"mv","op_count":0,"config":"vec (Variable) - dtype: float32, shape: [200]\nx (Variable) - dtype: float32, shape: [128, 200]\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/mv_1-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/mv_1-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/mv_1-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/mv_1-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/mv_1-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/mv_1-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.019750781431943476","paddle_perf_backwards":"0.039230367702568214","paddle_gpu_time":"0.0027869591346153843","paddle_gpu_time_backward":"0.006279323797139141"},{"name":"normalize_0","op":"normalize","op_count":0,"config":"x (Variable) - dtype: float32, shape: [16, 256, 128]\naxis (int): 1\nepsilon (float): 1e-12\np (int): 2\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/normalize_0-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/normalize_0-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/normalize_0-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/normalize_0-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/normalize_0-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/normalize_0-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.06518315295783841","paddle_perf_backwards":"0.1653454741653131","paddle_gpu_time":"0.020839288429375095","paddle_gpu_time_backward":"0.10400046114825916"},{"name":"not_equal_0","op":"not_equal","op_count":0,"config":"x (Variable) - dtype: int32, shape: [1]\ny (Variable) - dtype: int32, shape: [1]\ncond (string): None\nforce_cpu (string): None\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/not_equal_0-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/not_equal_0-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/not_equal_0-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/not_equal_0-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/not_equal_0-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/not_equal_0-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"--","paddle_perf":"0.02238812570820352","paddle_perf_backwards":"--","paddle_gpu_time":"0.0012246492985971945","paddle_gpu_time_backward":"--"},{"name":"not_equal_1","op":"not_equal","op_count":0,"config":"x (Variable) - dtype: float32, shape: [256, 1024]\ny (Variable) - dtype: float32, shape: [256, 1024]\ncond (string): None\nforce_cpu (string): None\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/not_equal_1-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/not_equal_1-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/not_equal_1-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/not_equal_1-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/not_equal_1-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/not_equal_1-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"--","paddle_perf":"0.020734580580839414","paddle_perf_backwards":"--","paddle_gpu_time":"0.0026132495948136144","paddle_gpu_time_backward":"--"},{"name":"not_equal_2","op":"not_equal","op_count":0,"config":"x (Variable) - dtype: int32, shape: [1024]\ny (Variable) - dtype: int32, shape: [256, 1024]\ncond (string): None\nforce_cpu (string): None\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/not_equal_2-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/not_equal_2-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/not_equal_2-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/not_equal_2-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/not_equal_2-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/not_equal_2-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"--","paddle_perf":"0.0250883952888076","paddle_perf_backwards":"--","paddle_gpu_time":"0.0033313216656608324","paddle_gpu_time_backward":"--"},{"name":"null_0","op":"null","op_count":0,"config":"None","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/null_0-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/null_0-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/null_0-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/null_0-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/null_0-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/null_0-tensorflow_gpu_speed_forward.txt","paddle_consistency":"--","paddle_consistency_backwards":"--","paddle_perf":"0.023322689289949378","paddle_perf_backwards":"--","paddle_gpu_time":"2.176057390524585e-05","paddle_gpu_time_backward":"--"},{"name":"one_hot_0","op":"one_hot","op_count":17,"config":"x (Variable) - dtype: int64, shape: [16, 1]\nallow_out_of_range (bool): False\nnum_classes (int): 37007\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/one_hot_0-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/one_hot_0-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/one_hot_0-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/one_hot_0-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/one_hot_0-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/one_hot_0-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"--","paddle_perf":"0.0258462769644601","paddle_perf_backwards":"--","paddle_gpu_time":"0.0036748108448928125","paddle_gpu_time_backward":"--"},{"name":"p_norm_0","op":"p_norm","op_count":0,"config":"x (Variable) - dtype: float32, shape: [300, 128, 128]\nasvector (bool): False\naxis (int): -1\nkeepdim (bool): False\nporder (float): 2.0\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/p_norm_0-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/p_norm_0-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/p_norm_0-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/p_norm_0-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/p_norm_0-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/p_norm_0-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.18662676519277144","paddle_perf_backwards":"0.3870771855724101","paddle_gpu_time":"0.1713993871297242","paddle_gpu_time_backward":"0.3919901997738409"},{"name":"p_norm_1","op":"p_norm","op_count":0,"config":"x (Variable) - dtype: float32, shape: [300, 128, 128]\naxis (int): -1\nkeepdim (bool): False\nporder (float): 3.0\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/p_norm_1-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/p_norm_1-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/p_norm_1-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/p_norm_1-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/p_norm_1-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/p_norm_1-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.18432140350341797","paddle_perf_backwards":"0.3860587976416763","paddle_gpu_time":"0.17128561217195956","paddle_gpu_time_backward":"0.3804253717297196"},{"name":"pad2d_0","op":"pad2d","op_count":0,"config":"x (Variable) - dtype: float32, shape: [16, 32, 255, 255]\ndata_format (string): NCHW\nmode (string): constant\npad (list): [0, 1, 0, 1]\nvalue (float): 0.0\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/pad2d_0-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/pad2d_0-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/pad2d_0-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/pad2d_0-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/pad2d_0-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/pad2d_0-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"1.127137860141126","paddle_perf_backwards":"2.3177146432868927","paddle_gpu_time":"1.1096286687206096","paddle_gpu_time_backward":"2.3614466918066506"},{"name":"pad2d_1","op":"pad2d","op_count":0,"config":"x (Variable) - dtype: float32, shape: [16, 3, 256, 256]\ndata_format (string): NCHW\nmode (string): reflect\npad (list): [3, 3, 3, 3]\nvalue (float): 0.0\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/pad2d_1-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/pad2d_1-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/pad2d_1-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/pad2d_1-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/pad2d_1-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/pad2d_1-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.14496395386845232","paddle_perf_backwards":"0.2669577617721864","paddle_gpu_time":"0.1202021876035797","paddle_gpu_time_backward":"0.26161453782980953"},{"name":"pad3d_0","op":"pad3d","op_count":0,"config":"x (Variable) - dtype: float32, shape: [16, 3, 64, 64, 64]\ndata_format (string): NCDHW\nmode (string): constant\npad (list): [1, 2, 3, 4, 5, 6]\nvalue (float): 0.0\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/pad3d_0-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/pad3d_0-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/pad3d_0-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/pad3d_0-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/pad3d_0-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/pad3d_0-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.219306313847921","paddle_perf_backwards":"0.4417245646557176","paddle_gpu_time":"0.21139283913262735","paddle_gpu_time_backward":"0.42658866249747823"},{"name":"pad3d_1","op":"pad3d","op_count":0,"config":"x (Variable) - dtype: float32, shape: [16, 3, 64, 64, 64]\ndata_format (string): NCDHW\nmode (string): reflect\npad (list): [1, 2, 3, 4, 5, 6]\nvalue (float): 0.0\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/pad3d_1-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/pad3d_1-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/pad3d_1-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/pad3d_1-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/pad3d_1-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/pad3d_1-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.24192002882440405","paddle_perf_backwards":"0.523554991526776","paddle_gpu_time":"0.24101944192606026","paddle_gpu_time_backward":"0.52451541361721"},{"name":"pixel_shuffle_0","op":"pixel_shuffle","op_count":0,"config":"x (Variable) - dtype: float32, shape: [16, 9, 224, 224]\ndata_format (string): NCHW\nupscale_factor (int): 3\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/pixel_shuffle_0-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/pixel_shuffle_0-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/pixel_shuffle_0-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/pixel_shuffle_0-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/pixel_shuffle_0-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/pixel_shuffle_0-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.09757499305569395","paddle_perf_backwards":"0.18823536074891384","paddle_gpu_time":"0.08605851063829786","paddle_gpu_time_backward":"0.1760717230008244"},{"name":"pixel_shuffle_1","op":"pixel_shuffle","op_count":0,"config":"x (Variable) - dtype: float32, shape: [16, 224, 224, 9]\ndata_format (string): NHWC\nupscale_factor (int): 3\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/pixel_shuffle_1-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/pixel_shuffle_1-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/pixel_shuffle_1-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/pixel_shuffle_1-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/pixel_shuffle_1-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/pixel_shuffle_1-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.10350018131489656","paddle_perf_backwards":"0.18658346059371014","paddle_gpu_time":"0.09141768937790944","paddle_gpu_time_backward":"0.174027448147766"},{"name":"pow_0","op":"pow","op_count":0,"config":"x (Variable) - dtype: float32, shape: [50, 128, 1000]\ny (Variable) - dtype: float32, shape: [128, 1000]\naxis (int): -1\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/pow_0-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/pow_0-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/pow_0-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/pow_0-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/pow_0-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/pow_0-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.08901823032367695","paddle_perf_backwards":"0.3645169723021972","paddle_gpu_time":"0.0690232768134845","paddle_gpu_time_backward":"0.34638958594730235"},{"name":"pow_1","op":"pow","op_count":0,"config":"x (Variable) - dtype: float32, shape: [50, 128, 1000]\ny (Variable) - dtype: float32, shape: [1, 128, 1000]\naxis (int): 0\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/pow_1-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/pow_1-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/pow_1-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/pow_1-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/pow_1-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/pow_1-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.0888666791285207","paddle_perf_backwards":"0.36444738059340115","paddle_gpu_time":"0.07177536231884057","paddle_gpu_time_backward":"0.34487266340484257"},{"name":"pow_2","op":"pow","op_count":0,"config":"x (Variable) - dtype: float32, shape: [16, 2048, 7, 7]\ny (Variable) - dtype: float32, shape: [16, 2048]\naxis (int): 0\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/pow_2-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/pow_2-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/pow_2-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/pow_2-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/pow_2-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/pow_2-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.042710705606158604","paddle_perf_backwards":"0.10708952714541632","paddle_gpu_time":"0.02342716396903589","paddle_gpu_time_backward":"0.08629624718207134"},{"name":"pow_3","op":"pow","op_count":0,"config":"x (Variable) - dtype: float32, shape: [16, 2048, 16, 16]\ny (Variable) - dtype: float32, shape: [16, 2048, 16, 16]\naxis (int): -1\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/pow_3-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/pow_3-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/pow_3-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/pow_3-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/pow_3-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/pow_3-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.14131343914177233","paddle_perf_backwards":"0.3729744282418597","paddle_gpu_time":"0.1251433764135703","paddle_gpu_time_backward":"0.356313761750655"},{"name":"pow_4","op":"pow","op_count":0,"config":"x (Variable) - dtype: float32, shape: [16, 1, 513, 513]\ny (Variable) - dtype: float32, shape: [1]\naxis (int): -1\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/pow_4-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/pow_4-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/pow_4-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/pow_4-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/pow_4-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/pow_4-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.06607878423167135","paddle_perf_backwards":"7.065717586295638","paddle_gpu_time":"0.047508051529790665","paddle_gpu_time_backward":"6.987447151197906"},{"name":"pow_5","op":"pow","op_count":0,"config":"x (Variable) - dtype: float32, shape: [512, 896, 4, 12]\ny (Variable) - dtype: float32, shape: [512, 896, 4, 1]\naxis (int): -1\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/pow_5-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/pow_5-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/pow_5-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/pow_5-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/pow_5-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/pow_5-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.2567394462998262","paddle_perf_backwards":"2.8715810221517253","paddle_gpu_time":"0.23971299093655588","paddle_gpu_time_backward":"2.830946403231088"},{"name":"pow_6","op":"pow","op_count":0,"config":"x (Variable) - dtype: float16, shape: [512, 896, 4, 12]\ny (Variable) - dtype: float16, shape: [512, 896, 4, 1]\naxis (int): -1\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/pow_6-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/pow_6-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/pow_6-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/pow_6-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/pow_6-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/pow_6-tensorflow_gpu_speed_forward.txt","paddle_consistency":"--","paddle_consistency_backwards":"--","paddle_perf":"--","paddle_perf_backwards":"--","paddle_gpu_time":"--","paddle_gpu_time_backward":"--"},{"name":"pow_7","op":"pow","op_count":0,"config":"x (Variable) - dtype: float16, shape: [32, 12, 128, 128]\ny (Variable) - dtype: float16, shape: [32, 1, 1, 128]\naxis (int): -1\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/pow_7-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/pow_7-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/pow_7-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/pow_7-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/pow_7-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/pow_7-tensorflow_gpu_speed_forward.txt","paddle_consistency":"--","paddle_consistency_backwards":"--","paddle_perf":"--","paddle_perf_backwards":"--","paddle_gpu_time":"--","paddle_gpu_time_backward":"--"},{"name":"pow_8","op":"pow","op_count":0,"config":"x (Variable) - dtype: float16, shape: [32, 1, 1, 128]\ny (Variable) - dtype: float16, shape: [1, 12, 128, 1]\naxis (int): -1\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/pow_8-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/pow_8-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/pow_8-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/pow_8-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/pow_8-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/pow_8-tensorflow_gpu_speed_forward.txt","paddle_consistency":"--","paddle_consistency_backwards":"--","paddle_perf":"--","paddle_perf_backwards":"--","paddle_gpu_time":"--","paddle_gpu_time_backward":"--"},{"name":"prod_0","op":"prod","op_count":0,"config":"x (Variable) - dtype: float32, shape: [16, 2048, 33, 33]\naxis (list): [2, 3]\nkeepdim (bool): True\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/prod_0-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/prod_0-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/prod_0-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/prod_0-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/prod_0-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/prod_0-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.19167279432675163","paddle_perf_backwards":"1.4833468712403446","paddle_gpu_time":"0.17441152597402598","paddle_gpu_time_backward":"1.4523652114442036"},{"name":"prod_1","op":"prod","op_count":0,"config":"x (Variable) - dtype: float32, shape: [16, 8, 128]\naxis (list): [1]\nkeepdim (bool): False\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/prod_1-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/prod_1-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/prod_1-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/prod_1-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/prod_1-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/prod_1-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.037770855183504065","paddle_perf_backwards":"0.0542852343345175","paddle_gpu_time":"0.0015629977537267713","paddle_gpu_time_backward":"0.006177145169994603"},{"name":"prod_2","op":"prod","op_count":0,"config":"x (Variable) - dtype: float32, shape: [16, 16, 1, 1]\naxis (list): [0]\nkeepdim (bool): False\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/prod_2-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/prod_2-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/prod_2-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/prod_2-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/prod_2-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/prod_2-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.02424570978904257","paddle_perf_backwards":"0.06208784726201272","paddle_gpu_time":"0.0018288406972030804","paddle_gpu_time_backward":"0.005902794117647057"},{"name":"prod_3","op":"prod","op_count":0,"config":"x (Variable) - dtype: float32, shape: [30522, 1024]\naxis (string): None\nkeepdim (bool): False\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/prod_3-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/prod_3-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/prod_3-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/prod_3-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/prod_3-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/prod_3-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.19732632474574394","paddle_perf_backwards":"0.5815793373780642","paddle_gpu_time":"0.1486355120732723","paddle_gpu_time_backward":"0.5069722142652535"},{"name":"relu_0","op":"relu","op_count":27,"config":"x (Variable) - dtype: float32, shape: [16, 128, 257, 257]\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/relu_0-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/relu_0-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/relu_0-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/relu_0-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/relu_0-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/relu_0-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"1.3539744761281596","paddle_perf_backwards":"3.2837433662108766","paddle_gpu_time":"1.3219915339649264","paddle_gpu_time_backward":"3.228794037940379"},{"name":"relu_1","op":"relu","op_count":27,"config":"x (Variable) - dtype: float16, shape: [16, 128, 257, 257]\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/relu_1-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/relu_1-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/relu_1-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/relu_1-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/relu_1-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/relu_1-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.700526796505303","paddle_perf_backwards":"1.6603213512825823","paddle_gpu_time":"0.6743392425463336","paddle_gpu_time_backward":"1.6361017525948616"},{"name":"relu6_0","op":"relu6","op_count":6,"config":"x (Variable) - dtype: float32, shape: [16, 128, 257, 257]\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/relu6_0-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/relu6_0-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/relu6_0-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/relu6_0-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/relu6_0-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/relu6_0-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"1.337684920890059","paddle_perf_backwards":"3.2740882737842014","paddle_gpu_time":"1.3216436700574075","paddle_gpu_time_backward":"3.228281117696867"},{"name":"relu6_1","op":"relu6","op_count":6,"config":"x (Variable) - dtype: float16, shape: [16, 128, 257, 257]\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/relu6_1-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/relu6_1-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/relu6_1-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/relu6_1-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/relu6_1-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/relu6_1-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.6879717649104361","paddle_perf_backwards":"1.6782909213660475","paddle_gpu_time":"0.674664115218048","paddle_gpu_time_backward":"1.6369525267993874"},{"name":"remainder_0","op":"remainder","op_count":0,"config":"x (Variable) - dtype: float32, shape: [50, 128, 1000]\ny (Variable) - dtype: float32, shape: [128, 1000]\naxis (int): -1\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/remainder_0-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/remainder_0-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/remainder_0-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/remainder_0-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/remainder_0-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/remainder_0-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"--","paddle_perf":"0.0941829041795091","paddle_perf_backwards":"--","paddle_gpu_time":"0.0690914107967088","paddle_gpu_time_backward":"--"},{"name":"remainder_1","op":"remainder","op_count":0,"config":"x (Variable) - dtype: float32, shape: [50, 128, 1000]\ny (Variable) - dtype: float32, shape: [1, 128, 1000]\naxis (int): 0\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/remainder_1-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/remainder_1-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/remainder_1-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/remainder_1-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/remainder_1-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/remainder_1-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"--","paddle_perf":"0.09341003421791091","paddle_perf_backwards":"--","paddle_gpu_time":"0.06994762288477036","paddle_gpu_time_backward":"--"},{"name":"remainder_2","op":"remainder","op_count":0,"config":"x (Variable) - dtype: float32, shape: [16, 2048, 7, 7]\ny (Variable) - dtype: float32, shape: [16, 2048]\naxis (int): 0\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/remainder_2-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/remainder_2-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/remainder_2-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/remainder_2-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/remainder_2-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/remainder_2-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"--","paddle_perf":"0.043435947211806436","paddle_perf_backwards":"--","paddle_gpu_time":"0.021873868436934216","paddle_gpu_time_backward":"--"},{"name":"remainder_3","op":"remainder","op_count":0,"config":"x (Variable) - dtype: float32, shape: [16, 2048, 16, 16]\ny (Variable) - dtype: float32, shape: [16, 2048, 16, 16]\naxis (int): -1\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/remainder_3-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/remainder_3-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/remainder_3-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/remainder_3-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/remainder_3-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/remainder_3-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"--","paddle_perf":"0.14694152709716307","paddle_perf_backwards":"--","paddle_gpu_time":"0.12395191433478128","paddle_gpu_time_backward":"--"},{"name":"remainder_4","op":"remainder","op_count":0,"config":"x (Variable) - dtype: float32, shape: [16, 1, 513, 513]\ny (Variable) - dtype: float32, shape: [1]\naxis (int): -1\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/remainder_4-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/remainder_4-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/remainder_4-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/remainder_4-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/remainder_4-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/remainder_4-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"--","paddle_perf":"0.06608585556427797","paddle_perf_backwards":"--","paddle_gpu_time":"0.045551192993053456","paddle_gpu_time_backward":"--"},{"name":"remainder_5","op":"remainder","op_count":0,"config":"x (Variable) - dtype: float32, shape: [512, 896, 4, 12]\ny (Variable) - dtype: float32, shape: [512, 896, 4, 1]\naxis (int): -1\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/remainder_5-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/remainder_5-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/remainder_5-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/remainder_5-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/remainder_5-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/remainder_5-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"--","paddle_perf":"0.2603802031171107","paddle_perf_backwards":"--","paddle_gpu_time":"0.2364695809830782","paddle_gpu_time_backward":"--"},{"name":"remainder_6","op":"remainder","op_count":0,"config":"x (Variable) - dtype: float16, shape: [512, 896, 4, 12]\ny (Variable) - dtype: float16, shape: [512, 896, 4, 1]\naxis (int): -1\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/remainder_6-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/remainder_6-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/remainder_6-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/remainder_6-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/remainder_6-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/remainder_6-tensorflow_gpu_speed_forward.txt","paddle_consistency":"--","paddle_consistency_backwards":"--","paddle_perf":"--","paddle_perf_backwards":"--","paddle_gpu_time":"--","paddle_gpu_time_backward":"--"},{"name":"remainder_7","op":"remainder","op_count":0,"config":"x (Variable) - dtype: float16, shape: [32, 12, 128, 128]\ny (Variable) - dtype: float16, shape: [32, 1, 1, 128]\naxis (int): -1\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/remainder_7-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/remainder_7-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/remainder_7-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/remainder_7-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/remainder_7-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/remainder_7-tensorflow_gpu_speed_forward.txt","paddle_consistency":"--","paddle_consistency_backwards":"--","paddle_perf":"--","paddle_perf_backwards":"--","paddle_gpu_time":"--","paddle_gpu_time_backward":"--"},{"name":"remainder_8","op":"remainder","op_count":0,"config":"x (Variable) - dtype: float16, shape: [32, 1, 1, 128]\ny (Variable) - dtype: float16, shape: [1, 12, 128, 1]\naxis (int): -1\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/remainder_8-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/remainder_8-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/remainder_8-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/remainder_8-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/remainder_8-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/remainder_8-tensorflow_gpu_speed_forward.txt","paddle_consistency":"--","paddle_consistency_backwards":"--","paddle_perf":"--","paddle_perf_backwards":"--","paddle_gpu_time":"--","paddle_gpu_time_backward":"--"},{"name":"reshape_0","op":"reshape","op_count":287,"config":"x (Variable) - dtype: float32, shape: [16, 513, 513, 19]\nshape (list): [-1, 19]\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/reshape_0-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/reshape_0-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/reshape_0-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/reshape_0-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/reshape_0-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/reshape_0-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.855584928717432","paddle_perf_backwards":"1.6878441065609495","paddle_gpu_time":"0.8182404437316225","paddle_gpu_time_backward":"1.633052996929649"},{"name":"reshape_1","op":"reshape","op_count":287,"config":"x (Variable) - dtype: float32, shape: [16]\nshape (list): [-1, 1]\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/reshape_1-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/reshape_1-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/reshape_1-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/reshape_1-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/reshape_1-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/reshape_1-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.04028008908641581","paddle_perf_backwards":"0.04433198850982044","paddle_gpu_time":"0.0013458210322712003","paddle_gpu_time_backward":"0.002718598546042003"},{"name":"roll_0","op":"roll","op_count":0,"config":"x (Variable) - dtype: float32, shape: [100, 1785]\naxis (int): 0\nshifts (int): 1\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/roll_0-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/roll_0-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/roll_0-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/roll_0-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/roll_0-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/roll_0-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.020183592426533604","paddle_perf_backwards":"0.02882310322352818","paddle_gpu_time":"0.0031274674506509875","paddle_gpu_time_backward":"0.006149310168625448"},{"name":"rsqrt_0","op":"rsqrt","op_count":4,"config":"x (Variable) - dtype: float32, shape: [16, 128, 257, 257]\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/rsqrt_0-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/rsqrt_0-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/rsqrt_0-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/rsqrt_0-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/rsqrt_0-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/rsqrt_0-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"1.3449997605685002","paddle_perf_backwards":"3.2522294468774584","paddle_gpu_time":"1.321507001108089","paddle_gpu_time_backward":"3.2274420179448113"},{"name":"rsqrt_1","op":"rsqrt","op_count":4,"config":"x (Variable) - dtype: float16, shape: [16, 128, 257, 257]\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/rsqrt_1-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/rsqrt_1-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/rsqrt_1-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/rsqrt_1-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/rsqrt_1-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/rsqrt_1-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.6905950381903945","paddle_perf_backwards":"1.6573372011433145","paddle_gpu_time":"0.6760489475274447","paddle_gpu_time_backward":"1.6377482541304718"},{"name":"scale_0","op":"scale","op_count":49,"config":"x (Variable) - dtype: float16, shape: [16, 16, 16]\nact (string): None\nbias (float): -1.0\nbias_after_scale (bool): False\nscale (float): 10000.0\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/scale_0-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/scale_0-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/scale_0-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/scale_0-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/scale_0-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/scale_0-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"--","paddle_perf":"0.019928143948924785","paddle_perf_backwards":"--","paddle_gpu_time":"0.0014838742496693456","paddle_gpu_time_backward":"--"},{"name":"scale_1","op":"scale","op_count":49,"config":"x (Variable) - dtype: float32, shape: [16, 16, 1024]\nact (string): None\nbias (float): 0.0\nbias_after_scale (bool): True\nscale (float): 32.0\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/scale_1-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/scale_1-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/scale_1-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/scale_1-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/scale_1-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/scale_1-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"--","paddle_perf":"0.0204150935253465","paddle_perf_backwards":"--","paddle_gpu_time":"0.002393779223419394","paddle_gpu_time_backward":"--"},{"name":"scatter_0","op":"scatter","op_count":2,"config":"index (Variable) - dtype: int32, shape: [16]\ninput (Variable) - dtype: float32, shape: [16, 64]\nupdates (Variable) - dtype: float32, shape: [16, 64]\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/scatter_0-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/scatter_0-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/scatter_0-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/scatter_0-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/scatter_0-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/scatter_0-tensorflow_gpu_speed_forward.txt","paddle_consistency":"--","paddle_consistency_backwards":"--","paddle_perf":"0.03119536808558873","paddle_perf_backwards":"0.05172977642137177","paddle_gpu_time":"0.0027572708981159683","paddle_gpu_time_backward":"0.0076618748128182085"},{"name":"scatter_1","op":"scatter","op_count":2,"config":"index (Variable) - dtype: int32, shape: [16]\ninput (Variable) - dtype: float32, shape: [16, 1024, 16]\nupdates (Variable) - dtype: float32, shape: [16, 1024, 16]\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/scatter_1-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/scatter_1-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/scatter_1-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/scatter_1-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/scatter_1-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/scatter_1-tensorflow_gpu_speed_forward.txt","paddle_consistency":"--","paddle_consistency_backwards":"--","paddle_perf":"0.03731126287376067","paddle_perf_backwards":"0.052826088595103064","paddle_gpu_time":"0.006187780922257366","paddle_gpu_time_backward":"0.015309523809523811"},{"name":"scatter_nd_add_0","op":"scatter_nd_add","op_count":0,"config":"index (Variable) - dtype: int32, shape: [8, 2]\ninput (Variable) - dtype: float32, shape: [16, 10, 10]\nupdates (Variable) - dtype: float32, shape: [8, 10]\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/scatter_nd_add_0-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/scatter_nd_add_0-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/scatter_nd_add_0-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/scatter_nd_add_0-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/scatter_nd_add_0-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/scatter_nd_add_0-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.03767086535084004","paddle_perf_backwards":"0.06033279457870794","paddle_gpu_time":"0.0051713098729227755","paddle_gpu_time_backward":"0.010682750301568157"},{"name":"scatter_nd_add_1","op":"scatter_nd_add","op_count":0,"config":"index (Variable) - dtype: int32, shape: [16, 3]\ninput (Variable) - dtype: float32, shape: [16, 256, 14, 14]\nupdates (Variable) - dtype: float32, shape: [16, 14]\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/scatter_nd_add_1-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/scatter_nd_add_1-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/scatter_nd_add_1-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/scatter_nd_add_1-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/scatter_nd_add_1-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/scatter_nd_add_1-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.03855802926672511","paddle_perf_backwards":"0.06191792737049272","paddle_gpu_time":"0.010380659490859388","paddle_gpu_time_backward":"0.025543127095397744"},{"name":"sequence_mask_0","op":"sequence_mask","op_count":8,"config":"maxlen (Variable) - dtype: int32, shape: [1]\nx (Variable) - dtype: int32, shape: [16]\ndtype (string): float32\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/sequence_mask_0-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/sequence_mask_0-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/sequence_mask_0-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/sequence_mask_0-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/sequence_mask_0-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/sequence_mask_0-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"--","paddle_perf":"0.06304638726370675","paddle_perf_backwards":"--","paddle_gpu_time":"0.004525942241801272","paddle_gpu_time_backward":"--"},{"name":"shape_0","op":"shape","op_count":0,"config":"input (Variable) - dtype: float32, shape: [16, 1]\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/shape_0-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/shape_0-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/shape_0-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/shape_0-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/shape_0-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/shape_0-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"--","paddle_perf":"0.010757057034239476","paddle_perf_backwards":"--","paddle_gpu_time":"2.4158582876423452e-05","paddle_gpu_time_backward":"--"},{"name":"sigmoid_0","op":"sigmoid","op_count":31,"config":"x (Variable) - dtype: float32, shape: [16, 128, 257, 257]\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/sigmoid_0-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/sigmoid_0-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/sigmoid_0-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/sigmoid_0-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/sigmoid_0-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/sigmoid_0-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"1.3364945719380656","paddle_perf_backwards":"3.2512845161682615","paddle_gpu_time":"1.3252795969773299","paddle_gpu_time_backward":"3.231389830508475"},{"name":"sigmoid_1","op":"sigmoid","op_count":31,"config":"x (Variable) - dtype: float16, shape: [16, 128, 257, 257]\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/sigmoid_1-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/sigmoid_1-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/sigmoid_1-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/sigmoid_1-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/sigmoid_1-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/sigmoid_1-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.6992435168646619","paddle_perf_backwards":"1.6663444066095447","paddle_gpu_time":"0.6854613215149072","paddle_gpu_time_backward":"1.6467054794520548"},{"name":"sigmoid_cross_entropy_with_logits_0","op":"sigmoid_cross_entropy_with_logits","op_count":12,"config":"label (Variable) - dtype: float32, shape: [16, 900]\nx (Variable) - dtype: float32, shape: [16, 900]\nignore_index (int): -100\nnormalize (bool): False\n","timestamp":"2021.0312.104831.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.0312.104831.gcc82.post107.develop/sigmoid_cross_entropy_with_logits_0-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.0312.104831.gcc82.post107.develop/sigmoid_cross_entropy_with_logits_0-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.0312.104831.gcc82.post107.develop/sigmoid_cross_entropy_with_logits_0-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.0312.104831.gcc82.post107.develop/sigmoid_cross_entropy_with_logits_0-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.0312.104831.gcc82.post107.develop/sigmoid_cross_entropy_with_logits_0-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.0312.104831.gcc82.post107.develop/sigmoid_cross_entropy_with_logits_0-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.020173861055958024","paddle_perf_backwards":"0.03149825699475346","paddle_gpu_time":"0.001556322786766979","paddle_gpu_time_backward":"0.003836954431840031"},{"name":"sigmoid_cross_entropy_with_logits_1","op":"sigmoid_cross_entropy_with_logits","op_count":12,"config":"label (Variable) - dtype: float32, shape: [16, 63504]\nx (Variable) - dtype: float32, shape: [16, 63504]\nignore_index (int): -1\nnormalize (bool): True\n","timestamp":"2021.0312.104831.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.0312.104831.gcc82.post107.develop/sigmoid_cross_entropy_with_logits_1-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.0312.104831.gcc82.post107.develop/sigmoid_cross_entropy_with_logits_1-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.0312.104831.gcc82.post107.develop/sigmoid_cross_entropy_with_logits_1-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.0312.104831.gcc82.post107.develop/sigmoid_cross_entropy_with_logits_1-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.0312.104831.gcc82.post107.develop/sigmoid_cross_entropy_with_logits_1-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.0312.104831.gcc82.post107.develop/sigmoid_cross_entropy_with_logits_1-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.037319593161464215","paddle_perf_backwards":"0.0659092363104763","paddle_gpu_time":"0.022909495849362218","paddle_gpu_time_backward":"0.04957171496263897"},{"name":"sin_0","op":"sin","op_count":2,"config":"x (Variable) - dtype: float32, shape: [16, 128, 257, 257]\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/sin_0-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/sin_0-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/sin_0-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/sin_0-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/sin_0-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/sin_0-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"1.3346246105874469","paddle_perf_backwards":"3.247939178604401","paddle_gpu_time":"1.3233225871448722","paddle_gpu_time_backward":"3.2298492292054886"},{"name":"sin_1","op":"sin","op_count":2,"config":"x (Variable) - dtype: float16, shape: [16, 128, 257, 257]\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/sin_1-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/sin_1-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/sin_1-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/sin_1-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/sin_1-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/sin_1-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.6946616277904931","paddle_perf_backwards":"1.7008365753418457","paddle_gpu_time":"0.6788213961922032","paddle_gpu_time_backward":"1.6506847918436702"},{"name":"sinh_0","op":"sinh","op_count":0,"config":"x (Variable) - dtype: float32, shape: [16, 128, 257, 257]\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/sinh_0-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/sinh_0-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/sinh_0-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/sinh_0-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/sinh_0-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/sinh_0-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"1.336435468975671","paddle_perf_backwards":"3.2544358937678215","paddle_gpu_time":"1.3222166246851383","paddle_gpu_time_backward":"3.2282133784928027"},{"name":"sinh_1","op":"sinh","op_count":0,"config":"x (Variable) - dtype: float16, shape: [16, 128, 257, 257]\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/sinh_1-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/sinh_1-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/sinh_1-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/sinh_1-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/sinh_1-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/sinh_1-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.6919448982498688","paddle_perf_backwards":"1.6677965859850805","paddle_gpu_time":"0.6773282288938143","paddle_gpu_time_backward":"1.6445003399048266"},{"name":"slice_0","op":"slice","op_count":76,"config":"input (Variable) - dtype: float32, shape: [16, 800]\naxes (list): [1]\nends (list): [400]\nstarts (list): [200]\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/slice_0-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/slice_0-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/slice_0-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/slice_0-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/slice_0-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/slice_0-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.024388031083710338","paddle_perf_backwards":"0.04004702276113081","paddle_gpu_time":"0.0015134887508907663","paddle_gpu_time_backward":"0.004098848203541345"},{"name":"slice_1","op":"slice","op_count":76,"config":"input (Variable) - dtype: float32, shape: [35, 16, 1500]\naxes (list): [0]\nends (list): [35]\nstarts (list): [34]\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/slice_1-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/slice_1-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/slice_1-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/slice_1-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/slice_1-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/slice_1-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.02390215466091702","paddle_perf_backwards":"0.035558639465270936","paddle_gpu_time":"0.0016510644799837017","paddle_gpu_time_backward":"0.006055302919172657"},{"name":"slice_2","op":"slice","op_count":76,"config":"input (Variable) - dtype: float32, shape: [2, 16, 1500]\naxes (list): [0]\nends (list): [1]\nstarts (list): [0]\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/slice_2-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/slice_2-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/slice_2-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/slice_2-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/slice_2-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/slice_2-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.02398515234188158","paddle_perf_backwards":"0.037946992990921954","paddle_gpu_time":"0.0016330095080611823","paddle_gpu_time_backward":"0.003978765759787658"},{"name":"slice_3","op":"slice","op_count":76,"config":"input (Variable) - dtype: float32, shape: [512, 1407, 4, 12]\naxes (list): [0, 1, 2, 3]\nends (list): [100000000, 896, 100000000, 100000000]\nstarts (list): [0, 0, 0, 0]\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/slice_3-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/slice_3-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/slice_3-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/slice_3-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/slice_3-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/slice_3-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.2660661337366066","paddle_perf_backwards":"0.5811898105115776","paddle_gpu_time":"0.24909629327070357","paddle_gpu_time_backward":"0.5615384615384614"},{"name":"slice_4","op":"slice","op_count":76,"config":"input (Variable) - dtype: float16, shape: [512, 1407, 4, 12]\naxes (list): [0, 1, 2, 3]\nends (list): [100000000, 896, 100000000, 100000000]\nstarts (list): [0, 0, 0, 0]\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/slice_4-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/slice_4-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/slice_4-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/slice_4-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/slice_4-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/slice_4-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.1637374540888162","paddle_perf_backwards":"0.39486650482239016","paddle_gpu_time":"0.14581762608252674","paddle_gpu_time_backward":"0.37392044524472084"},{"name":"softmax_0","op":"softmax","op_count":57,"config":"x (Variable) - dtype: float32, shape: [16, 1000]\naxis (int): -1\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/softmax_0-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/softmax_0-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/softmax_0-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/softmax_0-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/softmax_0-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/softmax_0-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.020836567392154618","paddle_perf_backwards":"0.03165103951278998","paddle_gpu_time":"0.0026560760902559952","paddle_gpu_time_backward":"0.0062670386608210445"},{"name":"softmax_1","op":"softmax","op_count":57,"config":"x (Variable) - dtype: float16, shape: [16, 1000]\naxis (int): -1\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/softmax_1-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/softmax_1-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/softmax_1-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/softmax_1-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/softmax_1-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/softmax_1-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.021432370555644134","paddle_perf_backwards":"0.028021238288100884","paddle_gpu_time":"0.0044968313046977165","paddle_gpu_time_backward":"0.008498803009575924"},{"name":"softmax_2","op":"softmax","op_count":57,"config":"x (Variable) - dtype: float32, shape: [32, 12, 128, 128]\naxis (int): -1\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/softmax_2-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/softmax_2-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/softmax_2-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/softmax_2-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/softmax_2-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/softmax_2-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.0773188579513366","paddle_perf_backwards":"0.1804336007819118","paddle_gpu_time":"0.06412624314998984","paddle_gpu_time_backward":"0.15515646258503402"},{"name":"softmax_3","op":"softmax","op_count":57,"config":"x (Variable) - dtype: float16, shape: [32, 12, 128, 128]\naxis (int): -1\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/softmax_3-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/softmax_3-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/softmax_3-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/softmax_3-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/softmax_3-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/softmax_3-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.04906151668134942","paddle_perf_backwards":"0.09678362842544495","paddle_gpu_time":"0.0353683997162258","paddle_gpu_time_backward":"0.08267462267462268"},{"name":"softmax_4","op":"softmax","op_count":57,"config":"x (Variable) - dtype: float32, shape: [15, 16, 33, 33]\naxis (int): -1\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/softmax_4-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/softmax_4-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/softmax_4-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/softmax_4-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/softmax_4-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/softmax_4-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.02298211764715955","paddle_perf_backwards":"0.027381131548680857","paddle_gpu_time":"0.005265201685731487","paddle_gpu_time_backward":"0.008984078068823833"},{"name":"softmax_5","op":"softmax","op_count":57,"config":"x (Variable) - dtype: float16, shape: [15, 16, 33, 33]\naxis (int): -1\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/softmax_5-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/softmax_5-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/softmax_5-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/softmax_5-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/softmax_5-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/softmax_5-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.018159301581984772","paddle_perf_backwards":"0.029898095943168074","paddle_gpu_time":"0.004538384445780717","paddle_gpu_time_backward":"0.008705998356614626"},{"name":"softmax_6","op":"softmax","op_count":57,"config":"x (Variable) - dtype: float32, shape: [128, 128, 16, 16]\naxis (int): 0\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/softmax_6-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/softmax_6-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/softmax_6-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/softmax_6-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/softmax_6-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/softmax_6-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.19006729125976562","paddle_perf_backwards":"0.3240252027706224","paddle_gpu_time":"0.17661701693531934","paddle_gpu_time_backward":"0.3162762022194821"},{"name":"softmax_7","op":"softmax","op_count":57,"config":"x (Variable) - dtype: float16, shape: [128, 128, 16, 16]\naxis (int): 0\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/softmax_7-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/softmax_7-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/softmax_7-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/softmax_7-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/softmax_7-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/softmax_7-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.16897478882147343","paddle_perf_backwards":"0.26627681693252253","paddle_gpu_time":"0.14603720177921553","paddle_gpu_time_backward":"0.2403442986193966"},{"name":"softmax_8","op":"softmax","op_count":57,"config":"x (Variable) - dtype: float32, shape: [512, 896, 4, 12]\naxis (int): 1\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/softmax_8-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/softmax_8-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/softmax_8-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/softmax_8-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/softmax_8-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/softmax_8-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"1.214469695577816","paddle_perf_backwards":"2.1376079442549725","paddle_gpu_time":"1.205351712953578","paddle_gpu_time_backward":"2.1142730966260377"},{"name":"softmax_9","op":"softmax","op_count":57,"config":"x (Variable) - dtype: float16, shape: [512, 896, 4, 12]\naxis (int): 1\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/softmax_9-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/softmax_9-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/softmax_9-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/softmax_9-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/softmax_9-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/softmax_9-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"1.0450976235525948","paddle_perf_backwards":"2.0851050104413713","paddle_gpu_time":"1.0414675939925413","paddle_gpu_time_backward":"2.089928343949044"},{"name":"softmax_with_cross_entropy_0","op":"softmax_with_cross_entropy","op_count":56,"config":"label (Variable) - dtype: float32, shape: [16, 37007]\nlogits (Variable) - dtype: float32, shape: [16, 37007]\naxis (int): -1\nignore_index (int): -100\nsoft_label (bool): True\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/softmax_with_cross_entropy_0-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/softmax_with_cross_entropy_0-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/softmax_with_cross_entropy_0-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/softmax_with_cross_entropy_0-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/softmax_with_cross_entropy_0-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/softmax_with_cross_entropy_0-tensorflow_gpu_speed_forward.txt","paddle_consistency":"--","paddle_consistency_backwards":"--","paddle_perf":"0.11570477972225268","paddle_perf_backwards":"0.13766240100471341","paddle_gpu_time":"0.10620450241952452","paddle_gpu_time_backward":"0.12376018626309661"},{"name":"softmax_with_cross_entropy_1","op":"softmax_with_cross_entropy","op_count":56,"config":"label (Variable) - dtype: int64, shape: [8, 512, 1024, 1]\nlogits (Variable) - dtype: float32, shape: [8, 512, 1024, 19]\naxis (int): -1\nignore_index (int): -100\nsoft_label (bool): False\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/softmax_with_cross_entropy_1-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/softmax_with_cross_entropy_1-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/softmax_with_cross_entropy_1-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/softmax_with_cross_entropy_1-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/softmax_with_cross_entropy_1-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/softmax_with_cross_entropy_1-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"1.3898168701723397","paddle_perf_backwards":"3.384454135435173","paddle_gpu_time":"1.4024381875063074","paddle_gpu_time_backward":"3.3998303777949115"},{"name":"softmax_with_cross_entropy_2","op":"softmax_with_cross_entropy","op_count":56,"config":"label (Variable) - dtype: int64, shape: [8, 1, 512, 1024]\nlogits (Variable) - dtype: float32, shape: [8, 19, 512, 1024]\naxis (int): 1\nignore_index (int): -100\nsoft_label (bool): False\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/softmax_with_cross_entropy_2-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/softmax_with_cross_entropy_2-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/softmax_with_cross_entropy_2-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/softmax_with_cross_entropy_2-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/softmax_with_cross_entropy_2-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/softmax_with_cross_entropy_2-tensorflow_gpu_speed_forward.txt","paddle_consistency":"--","paddle_consistency_backwards":"--","paddle_perf":"3.911814397695113","paddle_perf_backwards":"6.74598606265321","paddle_gpu_time":"3.9189571544058213","paddle_gpu_time_backward":"6.785400731626205"},{"name":"softplus_0","op":"softplus","op_count":0,"config":"x (Variable) - dtype: float32, shape: [16, 128, 257, 257]\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/softplus_0-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/softplus_0-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/softplus_0-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/softplus_0-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/softplus_0-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/softplus_0-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"1.3426620162321714","paddle_perf_backwards":"3.2649486002797836","paddle_gpu_time":"1.330907991534818","paddle_gpu_time_backward":"3.2381324278438033"},{"name":"softplus_1","op":"softplus","op_count":0,"config":"x (Variable) - dtype: float16, shape: [16, 128, 257, 257]\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/softplus_1-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/softplus_1-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/softplus_1-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/softplus_1-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/softplus_1-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/softplus_1-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.8669820720542647","paddle_perf_backwards":"1.805807139448269","paddle_gpu_time":"0.8647334540333937","paddle_gpu_time_backward":"1.8003173431734318"},{"name":"softsign_0","op":"softsign","op_count":1,"config":"x (Variable) - dtype: float32, shape: [16, 128, 257, 257]\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/softsign_0-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/softsign_0-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/softsign_0-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/softsign_0-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/softsign_0-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/softsign_0-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"1.3378798842191217","paddle_perf_backwards":"3.251453845916626","paddle_gpu_time":"1.323639661426844","paddle_gpu_time_backward":"3.2297475859732336"},{"name":"softsign_1","op":"softsign","op_count":1,"config":"x (Variable) - dtype: float16, shape: [16, 128, 257, 257]\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/softsign_1-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/softsign_1-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/softsign_1-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/softsign_1-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/softsign_1-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/softsign_1-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.6950900406541233","paddle_perf_backwards":"1.670726339420479","paddle_gpu_time":"0.6816433249370277","paddle_gpu_time_backward":"1.6515659574468082"},{"name":"split_0","op":"split","op_count":34,"config":"x (Variable) - dtype: float32, shape: [16, 35, 1500]\naxis (int): 1\nnum_or_sections (int): 35\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/split_0-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/split_0-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/split_0-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/split_0-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/split_0-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/split_0-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.054610262111741664","paddle_perf_backwards":"0.4886919138382892","paddle_gpu_time":"0.011020939734422882","paddle_gpu_time_backward":"0.06521407442005292"},{"name":"split_1","op":"split","op_count":34,"config":"x (Variable) - dtype: float32, shape: [16, 800]\naxis (int): -1\nnum_or_sections (int): 4\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/split_1-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/split_1-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/split_1-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/split_1-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/split_1-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/split_1-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.029759990925691565","paddle_perf_backwards":"0.07541276970688178","paddle_gpu_time":"0.0019985806974858068","paddle_gpu_time_backward":"0.007737397888025502"},{"name":"sqrt_0","op":"sqrt","op_count":5,"config":"x (Variable) - dtype: float32, shape: [16, 128, 257, 257]\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/sqrt_0-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/sqrt_0-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/sqrt_0-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/sqrt_0-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/sqrt_0-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/sqrt_0-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"1.3336197885578285","paddle_perf_backwards":"3.2463036940427488","paddle_gpu_time":"1.322317380352645","paddle_gpu_time_backward":"3.228885878767355"},{"name":"sqrt_1","op":"sqrt","op_count":5,"config":"x (Variable) - dtype: float16, shape: [16, 128, 257, 257]\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/sqrt_1-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/sqrt_1-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/sqrt_1-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/sqrt_1-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/sqrt_1-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/sqrt_1-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.6916114228044101","paddle_perf_backwards":"1.6631993119845647","paddle_gpu_time":"0.6780821917808219","paddle_gpu_time_backward":"1.645970403129784"},{"name":"square_0","op":"square","op_count":41,"config":"x (Variable) - dtype: float32, shape: [16, 128, 257, 257]\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/square_0-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/square_0-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/square_0-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/square_0-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/square_0-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/square_0-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"1.3314448998781867","paddle_perf_backwards":"3.2442415644506175","paddle_gpu_time":"1.3219836710009072","paddle_gpu_time_backward":"3.2272773450728076"},{"name":"square_1","op":"square","op_count":41,"config":"x (Variable) - dtype: float16, shape: [16, 128, 257, 257]\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/square_1-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/square_1-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/square_1-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/square_1-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/square_1-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/square_1-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.686971123567325","paddle_perf_backwards":"1.6558408020493502","paddle_gpu_time":"0.6744191435768262","paddle_gpu_time_backward":"1.6360802311745706"},{"name":"squeeze_0","op":"squeeze","op_count":37,"config":"x (Variable) - dtype: float32, shape: [16, 1, 512]\naxis (list): [1]\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/squeeze_0-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/squeeze_0-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/squeeze_0-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/squeeze_0-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/squeeze_0-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/squeeze_0-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.0236944276459363","paddle_perf_backwards":"0.03825158488993742","paddle_gpu_time":"0.0012546938775510204","paddle_gpu_time_backward":"0.002617771936918722"},{"name":"stack_0","op":"stack","op_count":30,"config":"x (list<Variable>[16]) - dtype: float32, shape: [16, 16, 16]; \naxis (int): 1\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/stack_0-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/stack_0-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/stack_0-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/stack_0-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/stack_0-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/stack_0-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.030216392205685984","paddle_perf_backwards":"0.05751653593413684","paddle_gpu_time":"0.0038309537407195888","paddle_gpu_time_backward":"0.008363758389261743"},{"name":"subtract_0","op":"subtract","op_count":0,"config":"x (Variable) - dtype: float32, shape: [50, 128, 1000]\ny (Variable) - dtype: float32, shape: [128, 1000]\naxis (int): -1\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/subtract_0-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/subtract_0-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/subtract_0-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/subtract_0-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/subtract_0-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/subtract_0-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.08278990889693405","paddle_perf_backwards":"0.1823334961204796","paddle_gpu_time":"0.06432744428829552","paddle_gpu_time_backward":"0.164159211398277"},{"name":"subtract_1","op":"subtract","op_count":0,"config":"x (Variable) - dtype: float32, shape: [50, 128, 1000]\ny (Variable) - dtype: float32, shape: [1, 128, 1000]\naxis (int): 0\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/subtract_1-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/subtract_1-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/subtract_1-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/subtract_1-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/subtract_1-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/subtract_1-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.09056750184787299","paddle_perf_backwards":"0.1825693136226677","paddle_gpu_time":"0.0645345557122708","paddle_gpu_time_backward":"0.16769821808075855"},{"name":"subtract_2","op":"subtract","op_count":0,"config":"x (Variable) - dtype: float32, shape: [16, 2048, 7, 7]\ny (Variable) - dtype: float32, shape: [16, 2048]\naxis (int): 0\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/subtract_2-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/subtract_2-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/subtract_2-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/subtract_2-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/subtract_2-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/subtract_2-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.03610620039976192","paddle_perf_backwards":"0.08771085070225901","paddle_gpu_time":"0.017470546772731847","paddle_gpu_time_backward":"0.06801374477833176"},{"name":"subtract_3","op":"subtract","op_count":0,"config":"x (Variable) - dtype: float32, shape: [16, 2048, 16, 16]\ny (Variable) - dtype: float32, shape: [16, 2048, 16, 16]\naxis (int): -1\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/subtract_3-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/subtract_3-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/subtract_3-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/subtract_3-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/subtract_3-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/subtract_3-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.1372949871606005","paddle_perf_backwards":"0.273771109227427","paddle_gpu_time":"0.12144690310195005","paddle_gpu_time_backward":"0.25142829457364335"},{"name":"subtract_4","op":"subtract","op_count":0,"config":"x (Variable) - dtype: float32, shape: [16, 1, 513, 513]\ny (Variable) - dtype: float32, shape: [1]\naxis (int): -1\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/subtract_4-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/subtract_4-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/subtract_4-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/subtract_4-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/subtract_4-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/subtract_4-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.06103095167385553","paddle_perf_backwards":"1.6685241926648096","paddle_gpu_time":"0.04268285311303647","paddle_gpu_time_backward":"1.628483417188623"},{"name":"subtract_5","op":"subtract","op_count":0,"config":"x (Variable) - dtype: float32, shape: [512, 896, 4, 12]\ny (Variable) - dtype: float32, shape: [512, 896, 4, 1]\naxis (int): -1\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/subtract_5-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/subtract_5-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/subtract_5-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/subtract_5-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/subtract_5-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/subtract_5-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.24455671081084285","paddle_perf_backwards":"2.8568513407735883","paddle_gpu_time":"0.2249828594474692","paddle_gpu_time_backward":"2.818928610235792"},{"name":"subtract_6","op":"subtract","op_count":0,"config":"x (Variable) - dtype: float16, shape: [512, 896, 4, 12]\ny (Variable) - dtype: float16, shape: [512, 896, 4, 1]\naxis (int): -1\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/subtract_6-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/subtract_6-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/subtract_6-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/subtract_6-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/subtract_6-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/subtract_6-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.13806637989948076","paddle_perf_backwards":"2.7510071854036013","paddle_gpu_time":"0.12149618320610686","paddle_gpu_time_backward":"2.7186355550905095"},{"name":"subtract_7","op":"subtract","op_count":0,"config":"x (Variable) - dtype: float16, shape: [32, 12, 128, 128]\ny (Variable) - dtype: float16, shape: [32, 1, 1, 128]\naxis (int): -1\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/subtract_7-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/subtract_7-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/subtract_7-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/subtract_7-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/subtract_7-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/subtract_7-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.05736279200358563","paddle_perf_backwards":"0.4702519700230365","paddle_gpu_time":"0.034752103822366424","paddle_gpu_time_backward":"0.45278996865203763"},{"name":"subtract_8","op":"subtract","op_count":0,"config":"x (Variable) - dtype: float16, shape: [32, 1, 1, 128]\ny (Variable) - dtype: float16, shape: [1, 12, 128, 1]\naxis (int): -1\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/subtract_8-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/subtract_8-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/subtract_8-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/subtract_8-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/subtract_8-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/subtract_8-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"False","paddle_perf":"0.06094506842817715","paddle_perf_backwards":"0.3341202745456734","paddle_gpu_time":"0.039277927792779284","paddle_gpu_time_backward":"0.3115570862032978"},{"name":"sum_0","op":"sum","op_count":5,"config":"x (Variable) - dtype: float32, shape: [16, 2048, 33, 33]\naxis (list): [2, 3]\nkeepdim (bool): True\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/sum_0-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/sum_0-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/sum_0-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/sum_0-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/sum_0-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/sum_0-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.18815435245185194","paddle_perf_backwards":"0.8324546661071166","paddle_gpu_time":"0.1739560886359016","paddle_gpu_time_backward":"0.8088945949415842"},{"name":"sum_1","op":"sum","op_count":5,"config":"x (Variable) - dtype: float32, shape: [16, 8, 128]\naxis (list): [1]\nkeepdim (bool): False\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/sum_1-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/sum_1-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/sum_1-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/sum_1-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/sum_1-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/sum_1-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.02590320548232721","paddle_perf_backwards":"0.03718478339059012","paddle_gpu_time":"0.001390094146541138","paddle_gpu_time_backward":"0.004471009975062344"},{"name":"sum_2","op":"sum","op_count":5,"config":"x (Variable) - dtype: float32, shape: [16, 16, 1, 1]\naxis (list): [0]\nkeepdim (bool): False\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/sum_2-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/sum_2-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/sum_2-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/sum_2-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/sum_2-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/sum_2-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.02345308965566207","paddle_perf_backwards":"0.037146831045345385","paddle_gpu_time":"0.001770214061073349","paddle_gpu_time_backward":"0.003938223938223939"},{"name":"sum_3","op":"sum","op_count":5,"config":"x (Variable) - dtype: float32, shape: [30522, 1024]\naxis (string): None\nkeepdim (bool): False\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/sum_3-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/sum_3-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/sum_3-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/sum_3-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/sum_3-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/sum_3-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.1701762537679118","paddle_perf_backwards":"0.3294186506099357","paddle_gpu_time":"0.14803752850922663","paddle_gpu_time_backward":"0.30286993690209646"},{"name":"switch_case_0","op":"switch_case","op_count":0,"config":"input (Variable) - dtype: int32, shape: [1]\nx (Variable) - dtype: float32, shape: [16, 256, 6, 6]\ny (Variable) - dtype: float32, shape: [16, 256, 6, 6]\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/switch_case_0-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/switch_case_0-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/switch_case_0-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/switch_case_0-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/switch_case_0-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/switch_case_0-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.45750579055474727","paddle_perf_backwards":"0.8581339096536441","paddle_gpu_time":"0.034866225999370476","paddle_gpu_time_backward":"0.08377721678763017"},{"name":"sync_batch_norm_0","op":"sync_batch_norm","op_count":0,"config":"x (Variable) - dtype: float32, shape: [16, 256]\ndata_format (string): NCHW\nepsilon (float): 1e-05\nmomentum (float): 0.9\ntraining (bool): True\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/sync_batch_norm_0-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/sync_batch_norm_0-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/sync_batch_norm_0-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/sync_batch_norm_0-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/sync_batch_norm_0-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/sync_batch_norm_0-tensorflow_gpu_speed_forward.txt","paddle_consistency":"--","paddle_consistency_backwards":"--","paddle_perf":"0.036811585329016856","paddle_perf_backwards":"0.06529871298342335","paddle_gpu_time":"0.0064332638164754955","paddle_gpu_time_backward":"0.015762620837808806"},{"name":"tanh_0","op":"tanh","op_count":15,"config":"x (Variable) - dtype: float32, shape: [16, 128, 257, 257]\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/tanh_0-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/tanh_0-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/tanh_0-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/tanh_0-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/tanh_0-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/tanh_0-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"1.3358249931870578","paddle_perf_backwards":"3.2478842085492396","paddle_gpu_time":"1.324355098750504","paddle_gpu_time_backward":"3.230057607590647"},{"name":"tanh_1","op":"tanh","op_count":15,"config":"x (Variable) - dtype: float16, shape: [16, 128, 257, 257]\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/tanh_1-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/tanh_1-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/tanh_1-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/tanh_1-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/tanh_1-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/tanh_1-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.69619720588944","paddle_perf_backwards":"1.6641721935692673","paddle_gpu_time":"0.6832198610131935","paddle_gpu_time_backward":"1.6443382352941178"},{"name":"temporal_shift_0","op":"temporal_shift","op_count":0,"config":"x (Variable) - dtype: float32, shape: [32, 64, 24, 42]\nseg_num (int): 2\nshift_ratio (float): 0.2\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/temporal_shift_0-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/temporal_shift_0-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/temporal_shift_0-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/temporal_shift_0-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/temporal_shift_0-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/temporal_shift_0-tensorflow_gpu_speed_forward.txt","paddle_consistency":"--","paddle_consistency_backwards":"--","paddle_perf":"0.04302454760754444","paddle_perf_backwards":"0.06815343496789894","paddle_gpu_time":"0.027037636584378792","paddle_gpu_time_backward":"0.05493472584856396"},{"name":"temporal_shift_1","op":"temporal_shift","op_count":0,"config":"x (Variable) - dtype: float32, shape: [128, 64, 24, 24]\nseg_num (int): 2\nshift_ratio (float): 0.2\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/temporal_shift_1-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/temporal_shift_1-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/temporal_shift_1-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/temporal_shift_1-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/temporal_shift_1-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/temporal_shift_1-tensorflow_gpu_speed_forward.txt","paddle_consistency":"--","paddle_consistency_backwards":"--","paddle_perf":"0.07505359419857163","paddle_perf_backwards":"0.12881779766465767","paddle_gpu_time":"0.05825928549741929","paddle_gpu_time_backward":"0.12082412060301508"},{"name":"tile_0","op":"tile","op_count":0,"config":"x (Variable) - dtype: float32, shape: [16, 1785, 1]\nrepeat_times (list): [1, 1, 2]\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/tile_0-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/tile_0-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/tile_0-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/tile_0-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/tile_0-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/tile_0-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.020698868498510242","paddle_perf_backwards":"0.03389485028325295","paddle_gpu_time":"0.0025193774794018916","paddle_gpu_time_backward":"0.006220121539500337"},{"name":"tile_1","op":"tile","op_count":0,"config":"x (Variable) - dtype: float32, shape: [16, 5, 1, 1]\nrepeat_times (list): [1, 1, 128, 128]\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/tile_1-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/tile_1-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/tile_1-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/tile_1-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/tile_1-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/tile_1-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.03281929055038764","paddle_perf_backwards":"6.094040676039093","paddle_gpu_time":"0.017126551862234683","paddle_gpu_time_backward":"5.999087353324642"},{"name":"tile_2","op":"tile","op_count":0,"config":"x (Variable) - dtype: float32, shape: [32, 807, 1]\nrepeat_times (list): [4, 1, 807]\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/tile_2-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/tile_2-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/tile_2-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/tile_2-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/tile_2-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/tile_2-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.6788959308546417","paddle_perf_backwards":"9.610410612456652","paddle_gpu_time":"0.6588158815881588","paddle_gpu_time_backward":"9.552288354103997"},{"name":"topk_0","op":"topk","op_count":12,"config":"x (Variable) - dtype: float32, shape: [16, 1000]\nk (int): 5\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/topk_0-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/topk_0-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/topk_0-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/topk_0-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/topk_0-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/topk_0-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.09244534434104451","paddle_perf_backwards":"0.08933106247259646","paddle_gpu_time":"0.05498392282958199","paddle_gpu_time_backward":"0.05690897729639659"},{"name":"topk_1","op":"topk","op_count":12,"config":"x (Variable) - dtype: float32, shape: [16, 3]\nk (int): 1\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/topk_1-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/topk_1-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/topk_1-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/topk_1-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/topk_1-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/topk_1-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.07954762906444317","paddle_perf_backwards":"0.07401826430340203","paddle_gpu_time":"0.04202742002492729","paddle_gpu_time_backward":"0.04434896401308615"},{"name":"trace_0","op":"trace","op_count":0,"config":"x (Variable) - dtype: float32, shape: [100, 1785]\naxis1 (int): 0\naxis2 (int): 1\noffset (int): 0\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/trace_0-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/trace_0-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/trace_0-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/trace_0-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/trace_0-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/trace_0-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.08990837603199239","paddle_perf_backwards":"0.15245603055370097","paddle_gpu_time":"0.006681853734845523","paddle_gpu_time_backward":"0.0172659793814433"},{"name":"transpose_0","op":"transpose","op_count":150,"config":"x (Variable) - dtype: float32, shape: [16, 14, 1, 1]\nperm (list): [0, 2, 3, 1]\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/transpose_0-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/transpose_0-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/transpose_0-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/transpose_0-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/transpose_0-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/transpose_0-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.02262957242070412","paddle_perf_backwards":"0.03082800884636081","paddle_gpu_time":"0.0013323840520748577","paddle_gpu_time_backward":"0.0027402807230132277"},{"name":"transpose_1","op":"transpose","op_count":150,"config":"x (Variable) - dtype: float32, shape: [16, 19, 513, 513]\nperm (list): [0, 2, 3, 1]\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/transpose_1-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/transpose_1-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/transpose_1-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/transpose_1-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/transpose_1-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/transpose_1-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.9620216955621559","paddle_perf_backwards":"1.9231419965445278","paddle_gpu_time":"0.9506863737598704","paddle_gpu_time_backward":"1.9007526665325014"},{"name":"transpose_2","op":"transpose","op_count":150,"config":"x (Variable) - dtype: float32, shape: [16, 128, 256]\nperm (list): [0, 2, 1]\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/transpose_2-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/transpose_2-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/transpose_2-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/transpose_2-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/transpose_2-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/transpose_2-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.02143153225082949","paddle_perf_backwards":"0.03245566264692559","paddle_gpu_time":"0.00413495056569157","paddle_gpu_time_backward":"0.009788465416414599"},{"name":"transpose_3","op":"transpose","op_count":150,"config":"x (Variable) - dtype: float32, shape: [4, 12, 512, 896]\nperm (list): [2, 3, 0, 1]\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/transpose_3-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/transpose_3-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/transpose_3-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/transpose_3-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/transpose_3-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/transpose_3-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.25761429085788956","paddle_perf_backwards":"0.4812271240724617","paddle_gpu_time":"0.24031246829664196","paddle_gpu_time_backward":"0.4626727089627392"},{"name":"transpose_4","op":"transpose","op_count":150,"config":"x (Variable) - dtype: float16, shape: [4, 12, 512, 896]\nperm (list): [2, 3, 0, 1]\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/transpose_4-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/transpose_4-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/transpose_4-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/transpose_4-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/transpose_4-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/transpose_4-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.16231450689844337","paddle_perf_backwards":"0.2966718980107442","paddle_gpu_time":"0.14974322684997976","paddle_gpu_time_backward":"0.2881886754500654"},{"name":"transpose_5","op":"transpose","op_count":150,"config":"x (Variable) - dtype: float32, shape: [4, 512, 512, 1]\nperm (list): [1, 2, 0, 3]\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/transpose_5-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/transpose_5-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/transpose_5-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/transpose_5-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/transpose_5-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/transpose_5-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.029926462824564858","paddle_perf_backwards":"0.049997333541931394","paddle_gpu_time":"0.013135020242914979","paddle_gpu_time_backward":"0.03339831189710611"},{"name":"transpose_6","op":"transpose","op_count":150,"config":"x (Variable) - dtype: float16, shape: [4, 512, 512, 1]\nperm (list): [1, 2, 0, 3]\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/transpose_6-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/transpose_6-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/transpose_6-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/transpose_6-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/transpose_6-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/transpose_6-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.023686981584173608","paddle_perf_backwards":"0.03205121281635331","paddle_gpu_time":"0.005528862737141118","paddle_gpu_time_backward":"0.01375133286389699"},{"name":"tril_0","op":"tril","op_count":0,"config":"x (Variable) - dtype: float32, shape: [1024, 2048]\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/tril_0-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/tril_0-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/tril_0-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/tril_0-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/tril_0-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/tril_0-tensorflow_gpu_speed_forward.txt","paddle_consistency":"--","paddle_consistency_backwards":"--","paddle_perf":"0.03392780670011887","paddle_perf_backwards":"0.05546218217021287","paddle_gpu_time":"0.02166424870466321","paddle_gpu_time_backward":"0.04251655291840684"},{"name":"triu_0","op":"triu","op_count":0,"config":"x (Variable) - dtype: float32, shape: [1024, 2048]\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/triu_0-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/triu_0-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/triu_0-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/triu_0-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/triu_0-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/triu_0-tensorflow_gpu_speed_forward.txt","paddle_consistency":"--","paddle_consistency_backwards":"--","paddle_perf":"0.03963889497699159","paddle_perf_backwards":"0.0609138999322448","paddle_gpu_time":"0.02687120666598087","paddle_gpu_time_backward":"0.05307489344428658"},{"name":"unique_0","op":"unique","op_count":0,"config":"x (Variable) - dtype: float32, shape: [100, 100]\naxis (string): None\ndtype (string): int64\nreturn_counts (bool): True\nreturn_index (bool): True\nreturn_inverse (bool): True\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/unique_0-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/unique_0-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/unique_0-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/unique_0-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/unique_0-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/unique_0-tensorflow_gpu_speed_forward.txt","paddle_consistency":"--","paddle_consistency_backwards":"--","paddle_perf":"1.0880377827858438","paddle_perf_backwards":"--","paddle_gpu_time":"0.3859603415834419","paddle_gpu_time_backward":"--"},{"name":"unique_1","op":"unique","op_count":0,"config":"x (Variable) - dtype: float32, shape: [4, 50, 30]\naxis (int): 1\ndtype (string): int64\nreturn_counts (bool): True\nreturn_index (bool): True\nreturn_inverse (bool): True\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/unique_1-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/unique_1-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/unique_1-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/unique_1-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/unique_1-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/unique_1-tensorflow_gpu_speed_forward.txt","paddle_consistency":"--","paddle_consistency_backwards":"--","paddle_perf":"0.7353597757767658","paddle_perf_backwards":"--","paddle_gpu_time":"0.40334484924623115","paddle_gpu_time_backward":"--"},{"name":"unsqueeze_0","op":"unsqueeze","op_count":44,"config":"x (Variable) - dtype: float32, shape: [16, 16, 1]\naxis (list): [2]\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/unsqueeze_0-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/unsqueeze_0-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/unsqueeze_0-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/unsqueeze_0-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/unsqueeze_0-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/unsqueeze_0-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.025111801770268657","paddle_perf_backwards":"0.03830121487987284","paddle_gpu_time":"0.0013793068297655454","paddle_gpu_time_backward":"0.002551115147845393"},{"name":"where_index_0","op":"where_index","op_count":0,"config":"x (Variable) - dtype: bool, shape: [16, 100, 100]\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/where_index_0-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/where_index_0-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/where_index_0-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/where_index_0-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/where_index_0-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/where_index_0-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"--","paddle_perf":"0.06352468412749622","paddle_perf_backwards":"--","paddle_gpu_time":"0.02116631130063966","paddle_gpu_time_backward":"--"},{"name":"where_index_1","op":"where_index","op_count":0,"config":"x (Variable) - dtype: int32, shape: [16, 10]\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/where_index_1-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/where_index_1-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/where_index_1-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/where_index_1-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/where_index_1-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/where_index_1-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"--","paddle_perf":"0.07170098168509348","paddle_perf_backwards":"--","paddle_gpu_time":"0.010370146243066064","paddle_gpu_time_backward":"--"},{"name":"where_index_2","op":"where_index","op_count":0,"config":"x (Variable) - dtype: float32, shape: [16, 1000]\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/where_index_2-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/where_index_2-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/where_index_2-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/where_index_2-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/where_index_2-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/where_index_2-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"--","paddle_perf":"0.059839414090526344","paddle_perf_backwards":"--","paddle_gpu_time":"0.011010791366906475","paddle_gpu_time_backward":"--"},{"name":"while_loop_0","op":"while_loop","op_count":0,"config":"bias (Variable) - dtype: float32, shape: [2048]\nweight (Variable) - dtype: float32, shape: [36864, 2048]\nx (Variable) - dtype: float32, shape: [16, 36864]\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/while_loop_0-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/while_loop_0-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/while_loop_0-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/while_loop_0-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/while_loop_0-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/while_loop_0-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"1.4588034882837413","paddle_perf_backwards":"3.955042605497399","paddle_gpu_time":"1.2018284106891701","paddle_gpu_time_backward":"3.54094759962347"},{"name":"while_loop_1","op":"while_loop","op_count":0,"config":"bias (Variable) - dtype: float32, shape: [1024]\nweight (Variable) - dtype: float32, shape: [1024, 1024]\nx (Variable) - dtype: float32, shape: [16, 16, 1024]\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/while_loop_1-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/while_loop_1-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/while_loop_1-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/while_loop_1-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/while_loop_1-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/while_loop_1-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.38945991165783944","paddle_perf_backwards":"0.6976341714664381","paddle_gpu_time":"0.09498564593301435","paddle_gpu_time_backward":"0.26178486055776895"},{"name":"while_loop_2","op":"while_loop","op_count":0,"config":"bias (Variable) - dtype: float32, shape: [1024]\nweight (Variable) - dtype: float32, shape: [12544, 1024]\nx (Variable) - dtype: float32, shape: [16, 12544]\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/while_loop_2-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/while_loop_2-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/while_loop_2-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/while_loop_2-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/while_loop_2-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/while_loop_2-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.5039649780350502","paddle_perf_backwards":"1.0877628519077494","paddle_gpu_time":"0.2427589367552704","paddle_gpu_time_backward":"0.6757570483814829"},{"name":"while_loop_3","op":"while_loop","op_count":0,"config":"bias (Variable) - dtype: float32, shape: [256]\nweight (Variable) - dtype: float32, shape: [16, 256]\nx (Variable) - dtype: float32, shape: [16, 16]\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/while_loop_3-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/while_loop_3-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/while_loop_3-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/while_loop_3-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/while_loop_3-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/while_loop_3-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"True","paddle_perf":"0.35235419565317583","paddle_perf_backwards":"0.6417615073067802","paddle_gpu_time":"0.020852953325072282","paddle_gpu_time_backward":"0.041831088664422"},{"name":"yolo_box_0","op":"yolo_box","op_count":1,"config":"img_size (Variable) - dtype: int32, shape: [32, 2]\nx (Variable) - dtype: float32, shape: [32, 21, 13, 13]\nanchors (list): [10, 13, 16, 30, 33, 23]\nclass_num (int): 2\nconf_thresh (float): 0.01\ndownsample_ratio (int): 32\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/yolo_box_0-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/yolo_box_0-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/yolo_box_0-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/yolo_box_0-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/yolo_box_0-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/yolo_box_0-tensorflow_gpu_speed_forward.txt","paddle_consistency":"--","paddle_consistency_backwards":"--","paddle_perf":"0.04008721156292651","paddle_perf_backwards":"--","paddle_gpu_time":"0.013164809723628997","paddle_gpu_time_backward":"--"},{"name":"yolo_box_1","op":"yolo_box","op_count":1,"config":"img_size (Variable) - dtype: int32, shape: [128, 2]\nx (Variable) - dtype: float32, shape: [128, 21, 26, 26]\nanchors (list): [10, 13, 16, 30, 33, 23]\nclass_num (int): 2\nconf_thresh (float): 0.01\ndownsample_ratio (int): 32\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/yolo_box_1-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/yolo_box_1-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/yolo_box_1-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/yolo_box_1-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/yolo_box_1-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/yolo_box_1-tensorflow_gpu_speed_forward.txt","paddle_consistency":"--","paddle_consistency_backwards":"--","paddle_perf":"0.07906146796352892","paddle_perf_backwards":"--","paddle_gpu_time":"0.05594174757281553","paddle_gpu_time_backward":"--"},{"name":"zeros_like_0","op":"zeros_like","op_count":5,"config":"x (Variable) - dtype: float32, shape: [30522, 1024]\nout (string): None\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/zeros_like_0-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/zeros_like_0-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/zeros_like_0-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/zeros_like_0-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/zeros_like_0-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/zeros_like_0-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"--","paddle_perf":"0.15890951863749472","paddle_perf_backwards":"--","paddle_gpu_time":"0.14643060353586668","paddle_gpu_time_backward":"--"},{"name":"zeros_like_1","op":"zeros_like","op_count":5,"config":"x (Variable) - dtype: float32, shape: [3]\nout (string): None\n","timestamp":"2021.1023.160715.gcc82.post107.develop","paddle_accuracy_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/zeros_like_1-paddle_gpu_accuracy_backward.txt","paddle_accuracy_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/zeros_like_1-paddle_gpu_accuracy_forward.txt","paddle_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/zeros_like_1-paddle_gpu_speed_backward.txt","paddle_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/zeros_like_1-paddle_gpu_speed_forward.txt","tf_speed_backward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/zeros_like_1-tensorflow_gpu_speed_backward.txt","tf_speed_forward_log_url":"logs/op_benchmark/2021.1023.160715.gcc82.post107.develop/zeros_like_1-tensorflow_gpu_speed_forward.txt","paddle_consistency":"True","paddle_consistency_backwards":"--","paddle_perf":"0.018170415138711735","paddle_perf_backwards":"--","paddle_gpu_time":"0.001253539777936233","paddle_gpu_time_backward":"--"}]
diff --git a/python/paddle/device/__init__.py b/python/paddle/device/__init__.py
index 0c53097d9ff3b..84b08fcdd39a0 100644
--- a/python/paddle/device/__init__.py
+++ b/python/paddle/device/__init__.py
@@ -48,7 +48,7 @@ def is_compiled_with_npu():
         .. code-block:: python
 
             import paddle
-            support_npu = paddle.is_compiled_with_npu()
+            support_npu = paddle.device.is_compiled_with_npu()
     """
     return core.is_compiled_with_npu()
 
@@ -63,7 +63,7 @@ def is_compiled_with_xpu():
         .. code-block:: python
 
             import paddle
-            support_xpu = paddle.is_compiled_with_xpu()
+            support_xpu = paddle.device.is_compiled_with_xpu()
     """
     return core.is_compiled_with_xpu()
 
@@ -77,10 +77,11 @@ def XPUPlace(dev_id):
 
     Examples:
         .. code-block:: python
+
             # required: xpu
             
             import paddle
-            place = paddle.XPUPlace(0)
+            place = paddle.device.XPUPlace(0)
     """
     return core.XPUPlace(dev_id)
 
@@ -98,7 +99,7 @@ def get_cudnn_version():
             
             import paddle
 
-            cudnn_version = paddle.get_cudnn_version()
+            cudnn_version = paddle.device.get_cudnn_version()
 
 
 
@@ -195,7 +196,7 @@ def set_device(device):
             
         import paddle
 
-        paddle.set_device("cpu")
+        paddle.device.set_device("cpu")
         x1 = paddle.ones(name='x1', shape=[1, 2], dtype='int32')
         x2 = paddle.zeros(name='x2', shape=[1, 2], dtype='int32')
         data = paddle.stack([x1,x2], axis=1)
@@ -217,7 +218,7 @@ def get_device():
      .. code-block:: python
             
         import paddle
-        device = paddle.get_device()
+        device = paddle.device.get_device()
 
     """
     device = ''
diff --git a/python/paddle/device/cuda/graphs.py b/python/paddle/device/cuda/graphs.py
index 612f4d2c8cebd..2a60aad2fd2de 100644
--- a/python/paddle/device/cuda/graphs.py
+++ b/python/paddle/device/cuda/graphs.py
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import os
 from paddle.fluid.core import is_compiled_with_cuda, is_compiled_with_rocm, CUDAPlace
 
 if is_compiled_with_cuda() and not is_compiled_with_rocm():
@@ -22,7 +23,8 @@ def __init__(self, place=None, mode="thread_local"):
             ALL_MODES = ["global", "thread_local", "relaxed"]
             self._graph = None
             if place is None:
-                place = CUDAPlace(0)
+                device_id = int(os.environ.get('FLAGS_selected_gpus', 0))
+                place = CUDAPlace(device_id)
             self._place = place
             assert mode in ALL_MODES
             self._mode = ALL_MODES.index(mode)
@@ -38,6 +40,16 @@ def replay(self):
 
         def reset(self):
             self._graph.reset()
+
+        def print_to_dot_files(self, dirname, flags=None):
+            if not isinstance(dirname, (str, bytes)):
+                dirname = dirname.name
+            os.makedirs(name=dirname, exist_ok=True)
+            assert os.path.isdir(
+                dirname), "The dirname {} should be a directory".format(dirname)
+            if flags is None:
+                flags = 2047  # only all information. It can be any integer inside [1, 2048)  
+            self._graph.print_to_dot_files(dirname, flags)
 else:
 
     class CUDAGraph:
@@ -55,3 +67,6 @@ def replay(self):
 
         def reset(self):
             raise NotImplementedError()
+
+        def print_to_dot_files(self, dirname, flags=None):
+            raise NotImplementedError()
diff --git a/python/paddle/distributed/__init__.py b/python/paddle/distributed/__init__.py
index 20007f76ed5e4..600327e4a508c 100644
--- a/python/paddle/distributed/__init__.py
+++ b/python/paddle/distributed/__init__.py
@@ -43,10 +43,6 @@
 
 from .auto_parallel import shard_op  # noqa: F401
 from .auto_parallel import shard_tensor  # noqa: F401
-from .auto_parallel import set_shard_mask  # noqa: F401
-from .auto_parallel import set_offload_device  # noqa: F401
-from .auto_parallel import set_pipeline_stage  # noqa: F401
-from .auto_parallel import ProcessMesh  # noqa: F401
 
 from .fleet import BoxPSDataset  # noqa: F401
 
diff --git a/python/paddle/distributed/auto_parallel/__init__.py b/python/paddle/distributed/auto_parallel/__init__.py
index 2779a9feb0b83..3b5ccaa062f6e 100644
--- a/python/paddle/distributed/auto_parallel/__init__.py
+++ b/python/paddle/distributed/auto_parallel/__init__.py
@@ -14,10 +14,11 @@
 
 from .interface import shard_tensor  # noqa: F401
 from .interface import shard_op  # noqa: F401
-from .interface import set_shard_mask  # noqa: F401
-from .interface import set_offload_device  # noqa: F401
-from .interface import set_pipeline_stage  # noqa: F401
-from .interface import ProcessMesh  # noqa: F401
+from .process_mesh import ProcessMesh
+# from .interface import set_shard_mask  # noqa: F401
+# from .interface import set_offload_device  # noqa: F401
+# from .interface import set_pipeline_stage  # noqa: F401
+# from .interface import ProcessMesh  # noqa: F401
 from .completion import complete_annotation  # noqa: F401
 from .completion import complete_backward_annotation  # noqa: F401
 from .reshard import reshard  # noqa: F401
diff --git a/python/paddle/distributed/auto_parallel/attribute.py b/python/paddle/distributed/auto_parallel/attribute.py
deleted file mode 100644
index 879e94b83733c..0000000000000
--- a/python/paddle/distributed/auto_parallel/attribute.py
+++ /dev/null
@@ -1,309 +0,0 @@
-#   Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License
-
-import copy
-from collections import defaultdict
-from paddle.fluid import core
-
-
-class TensorDistributedAttribute:
-    def __init__(self, owner_tensor, owner_context):
-        self._owner_tensor = owner_tensor
-        self._owner_context = owner_context
-        self._process_mesh = None
-        self._dims_mapping = None
-        self._shard_mask = None
-        self._offload_device = None
-        self._shape = None
-        self._is_annotated = {}
-        self._is_parameter = False
-
-    def get_owner_tensor(self):
-        return self._owner_tensor
-
-    def get_owner_context(self):
-        return self._owner_context
-
-    def get_process_mesh(self):
-        return self._process_mesh
-
-    def set_process_mesh(self, process_mesh):
-        self._process_mesh = copy.deepcopy(process_mesh)
-
-    def get_dims_mapping(self):
-        return self._dims_mapping
-
-    def set_dims_mapping(self, dims_mapping):
-        self._dims_mapping = copy.deepcopy(dims_mapping)
-
-    def get_shard_mask(self):
-        return self._shard_mask
-
-    def set_shard_mask(self, shard_mask):
-        self._shard_mask = copy.deepcopy(shard_mask)
-
-    def get_offload_device(self):
-        return self._offload_device
-
-    def set_offload_device(self, offload_device):
-        self._offload_device = copy.deepcopy(offload_device)
-
-    def get_shape(self):
-        return self._shape
-
-    def set_shape(self, shape):
-        self._shape = copy.deepcopy(shape)
-
-    def is_annotated(self, dist_attr_name):
-        return self._is_annotated.get(dist_attr_name, False)
-
-    def mark_as_annotated(self, dist_attr_name):
-        self._is_annotated[dist_attr_name] = True
-
-    def is_parameter(self):
-        return self._is_parameter
-
-    def mark_as_parameter(self):
-        self._is_parameter = True
-
-    def is_valid(self):
-        if self.get_owner_tensor().type == core.VarDesc.VarType.READER:
-            return True
-        tensor_shape = self.get_owner_tensor().desc.shape()
-        if len(tensor_shape) != len(self.get_dims_mapping()):
-            return False
-        for i in range(len(self.get_dims_mapping())):
-            if self.get_dims_mapping()[i] < -1 or self.get_dims_mapping()[
-                    i] >= len(self.get_process_mesh().topology):
-                return False
-        for i in range(len(self.get_process_mesh().topology)):
-            if self.get_dims_mapping().count(i) > 1:
-                return False
-        return True
-
-    def __str__(self):
-        str = "{{tensor name: {}, tensor id: {}".format(
-            self.get_owner_tensor().desc.name(),
-            self.get_owner_tensor().desc.id())
-        if self.is_annotated("process_mesh"):
-            annotated_str = "annotated"
-        else:
-            annotated_str = "non-annotated"
-        str += ", process_mesh ({}): {}".format(annotated_str,
-                                                self.get_process_mesh())
-
-        str += ", is_parameter: {}".format(self._is_parameter)
-
-        if self.is_annotated("dims_mapping"):
-            annotated_str = "annotated"
-        else:
-            annotated_str = "non-annotated"
-        str += ", dims_mapping ({}): {}".format(annotated_str,
-                                                self.get_dims_mapping())
-
-        if self.is_annotated("shard_mask"):
-            annotated_str = "annotated"
-        else:
-            annotated_str = "non-annotated"
-        str += ", shard_mask ({}): {}".format(annotated_str,
-                                              self.get_shard_mask())
-
-        if self.is_annotated("offload_device"):
-            annotated_str = "annotated"
-        else:
-            annotated_str = "non-annotated"
-        str += ", offload_device ({}): {} }}".format(annotated_str,
-                                                     self.get_offload_device())
-        return str
-
-    def __deepcopy__(self, memo):
-        cls = self.__class__
-        result = cls.__new__(cls)
-        memo[id(self)] = result
-        for k, v in self.__dict__.items():
-            # No need to copy the owner tensor and context
-            if k == "_owner_tensor" or k == "_owner_context":
-                setattr(result, k, v)
-            else:
-                setattr(result, k, copy.deepcopy(v, memo))
-        return result
-
-
-class OperatorDistributedAttribute:
-    def __init__(self, owner_op, owner_context):
-        self._owner_op = owner_op
-        self._owner_context = owner_context
-        self._process_mesh = None
-        self._dims_mapping = {}
-        self._shapes = {}
-        self._is_annotated = {}
-        self._is_parameters = {}
-        self._pipeline_stage = None
-        self._impl_idx = None
-
-    def get_owner_op(self):
-        return self._owner_op
-
-    def get_owner_context(self):
-        return self._owner_context
-
-    def get_process_mesh(self):
-        return self._process_mesh
-
-    def set_process_mesh(self, process_mesh):
-        self._process_mesh = copy.deepcopy(process_mesh)
-
-    def get_input_dims_mapping(self, name):
-        return self._dims_mapping.get("IN_" + name, None)
-
-    def set_input_dims_mapping(self, name, dims_mapping):
-        self._dims_mapping["IN_" + name] = copy.deepcopy(dims_mapping)
-
-    def get_output_dims_mapping(self, name):
-        return self._dims_mapping.get("OUT_" + name, None)
-
-    def set_output_dims_mapping(self, name, dims_mapping):
-        self._dims_mapping["OUT_" + name] = copy.deepcopy(dims_mapping)
-
-    def get_impl_idx(self):
-        return self._impl_idx
-
-    def set_impl_idx(self, impl_idx):
-        self._impl_idx = impl_idx
-
-    def get_pipeline_stage(self):
-        return self._pipeline_stage
-
-    def set_pipeline_stage(self, pipeline_stage):
-        self._pipeline_stage = copy.deepcopy(pipeline_stage)
-
-    def get_input_shape(self, name):
-        return self._shapes.get("IN_" + name, None)
-
-    def set_input_shape(self, name, shape):
-        self._shapes["IN_" + name] = copy.deepcopy(shape)
-
-    def get_output_shape(self, name):
-        return self._shapes.get("OUT_" + name, None)
-
-    def set_output_shape(self, name, shape):
-        self._shapes["OUT_" + name] = copy.deepcopy(shape)
-
-    def is_annotated(self, attr_name):
-        return self._is_annotated.get(attr_name, False)
-
-    def mark_as_annotated(self, attr_name):
-        self._is_annotated[attr_name] = True
-
-    def is_annotated_input_dims_mapping(self, name):
-        return self._is_annotated.get("IN_" + name, False)
-
-    def mark_as_annotated_input_dims_mapping(self, name):
-        self._is_annotated["IN_" + name] = True
-
-    def is_annotated_output_dims_mapping(self, name):
-        return self._is_annotated.get("OUT_" + name, False)
-
-    def mark_as_annotated_output_dims_mapping(self, name):
-        self._is_annotated["OUT_" + name] = True
-
-    def is_parameter(self, name):
-        return self._is_parameters.get(name, False)
-
-    def mark_as_parameter(self, name):
-        self._is_parameters[name] = True
-
-    def is_valid(self):
-        if "read" in self.get_owner_op().type:
-            return True
-        for name in self.get_owner_op().desc.input_arg_names():
-            dims_mapping = self.get_input_dims_mapping(name)
-            shape = self.get_input_shape(name)
-            if len(shape) != len(dims_mapping):
-                return False
-            for i in range(len(dims_mapping)):
-                if dims_mapping[i] < -1 or dims_mapping[i] >= len(
-                        self.get_process_mesh().topology):
-                    return False
-            for i in range(len(self.get_process_mesh().topology)):
-                if dims_mapping.count(i) > 1:
-                    return False
-        for name in self.get_owner_op().desc.output_arg_names():
-            dims_mapping = self.get_output_dims_mapping(name)
-            shape = self.get_output_shape(name)
-            if len(shape) != len(dims_mapping):
-                return False
-            for i in range(len(dims_mapping)):
-                if dims_mapping[i] < -1 or dims_mapping[i] >= len(
-                        self.get_process_mesh().topology):
-                    return False
-            for i in range(len(self.get_process_mesh().topology)):
-                if dims_mapping.count(i) > 1:
-                    return False
-        return True
-
-    def __str__(self):
-        str = "{{op type: {}, op id: {}".format(self.get_owner_op().desc.type(),
-                                                self.get_owner_op().desc.id())
-
-        if self.is_annotated("process_mesh"):
-            annotated_str = "annotated"
-        else:
-            annotated_str = "non-annotated"
-        str += ", process_mesh ({}): {}".format(annotated_str,
-                                                self.get_process_mesh())
-
-        for arg_name in self.get_owner_op().desc.input_arg_names():
-            dims_mapping = self.get_input_dims_mapping(arg_name)
-            if self.is_annotated_input_dims_mapping(arg_name):
-                annotated_str = "annotated"
-            else:
-                annotated_str = "non-annotated"
-            if self.is_parameter(arg_name):
-                is_parameter_str = "parameter"
-            else:
-                is_parameter_str = "non-parameter"
-            str += ", {}'s dims_mapping (input, {}, {}): {}".format(
-                arg_name, annotated_str, is_parameter_str, dims_mapping)
-
-        for arg_name in self.get_owner_op().desc.output_arg_names():
-            dims_mapping = self.get_output_dims_mapping(arg_name)
-            if self.is_annotated_output_dims_mapping(arg_name):
-                annotated_str = "annotated"
-            else:
-                annotated_str = "non-annotated"
-            if self.is_parameter(arg_name):
-                is_parameter_str = "parameter"
-            else:
-                is_parameter_str = "non-parameter"
-            str += ", {}'s dims_mapping (output, {}, {}): {}".format(
-                arg_name, annotated_str, is_parameter_str, dims_mapping)
-
-        str += ", pipeline stage: {}".format(self._pipeline_stage)
-
-        str += ", dist_impl idx: {} }}".format(self._impl_idx)
-
-        return str
-
-    def __deepcopy__(self, memo):
-        cls = self.__class__
-        result = cls.__new__(cls)
-        memo[id(self)] = result
-        for k, v in self.__dict__.items():
-            # No need to copy the owner op and context
-            if k == "_owner_op" or k == "_owner_context":
-                setattr(result, k, v)
-            else:
-                setattr(result, k, copy.deepcopy(v, memo))
-        return result
diff --git a/python/paddle/distributed/auto_parallel/completion.py b/python/paddle/distributed/auto_parallel/completion.py
index 855eb656bd90e..934239c0cd650 100644
--- a/python/paddle/distributed/auto_parallel/completion.py
+++ b/python/paddle/distributed/auto_parallel/completion.py
@@ -20,10 +20,13 @@
 from .utils import compute_compatible_process_mesh
 from .utils import compute_compatible_dim_mapping
 from .utils import compute_compatible_dims_mapping
-from .utils import print_program_with_distributed_attr
-from .context import get_default_distributed_context
+from .utils import print_program_with_dist_attr
 from .operators import find_best_compatible_distributed_operator_impl
-from .attribute import OperatorDistributedAttribute, TensorDistributedAttribute
+from .dist_context import get_default_distributed_context
+from .dist_tensor import DistributedTensor
+from .dist_op import DistributedOperator
+from .dist_attribute import TensorDistributedAttribute
+from .dist_attribute import OperatorDistributedAttribute
 from paddle.distributed.fleet.meta_optimizers.common import OpRole
 
 ELEMENTWISE_LIKE_OP_LIST = ["elementwise_add", "gelu", "dropout", "cast"]
@@ -43,36 +46,35 @@ def update_tensor_node_process_mesh(dist_context, tensor_node, fwd=True):
     process meshes are compatible for now.
     """
     changed = False
-    tensor_dist_attr = dist_context.get_tensor_distributed_attr_for_graph(
-        tensor_node)
+    tensor_dist_attr = dist_context.get_tensor_dist_attr_for_graph(tensor_node)
     if tensor_dist_attr.is_annotated("process_mesh"):
         return changed
-    tensor_process_mesh = tensor_dist_attr.get_process_mesh()
+    tensor_process_mesh = tensor_dist_attr.process_mesh
     if fwd:
         inputs_process_meshes = []
         for pred_op_node in tensor_node.inputs:
             if pred_op_node.op() is not None:
-                op_dist_attr = dist_context.get_op_distributed_attr_for_graph(
+                op_dist_attr = dist_context.get_op_dist_attr_for_graph(
                     pred_op_node)
-                op_process_mesh = op_dist_attr.get_process_mesh()
+                op_process_mesh = op_dist_attr.process_mesh
                 inputs_process_meshes.append(op_process_mesh)
         compatible_process_mesh = compute_compatible_process_mesh(
             inputs_process_meshes)
         if compatible_process_mesh is not None and tensor_process_mesh is None:
-            tensor_dist_attr.set_process_mesh(compatible_process_mesh)
+            tensor_dist_attr.process_mesh = compatible_process_mesh
             changed = True
     else:
         outputs_process_meshes = []
         for succ_op_node in tensor_node.outputs:
             if succ_op_node.op() is not None:
-                op_dist_attr = dist_context.get_op_distributed_attr_for_graph(
+                op_dist_attr = dist_context.get_op_dist_attr_for_graph(
                     succ_op_node)
-                op_process_mesh = op_dist_attr.get_process_mesh()
+                op_process_mesh = op_dist_attr.process_mesh
                 outputs_process_meshes.append(op_process_mesh)
         compatible_process_mesh = compute_compatible_process_mesh(
             outputs_process_meshes)
         if compatible_process_mesh is not None and tensor_process_mesh is None:
-            tensor_dist_attr.set_process_mesh(compatible_process_mesh)
+            tensor_dist_attr.process_mesh = compatible_process_mesh
             changed = True
     return changed
 
@@ -84,43 +86,47 @@ def update_op_node_process_mesh(dist_context, op_node, fwd=True):
     process meshes are compatible for now.
     """
     changed = False
-    op_dist_attr = dist_context.get_op_distributed_attr_for_graph(op_node)
+    op_dist_attr = dist_context.get_op_dist_attr_for_graph(op_node)
     if op_dist_attr.is_annotated("process_mesh"):
         return changed
-    op_process_mesh = op_dist_attr.get_process_mesh()
+    op_process_mesh = op_dist_attr.process_mesh
     if fwd:
         inputs_process_meshes = []
         for tensor_node in op_node.inputs:
             if tensor_node.var() is not None:
-                tensor_dist_attr = dist_context.get_tensor_distributed_attr_for_graph(
+                tensor_dist_attr = dist_context.get_tensor_dist_attr_for_graph(
                     tensor_node)
-                tensor_process_mesh = tensor_dist_attr.get_process_mesh()
+                tensor_process_mesh = tensor_dist_attr.process_mesh
                 inputs_process_meshes.append(tensor_process_mesh)
         compatible_process_mesh = compute_compatible_process_mesh(
             inputs_process_meshes)
         if compatible_process_mesh is not None and op_process_mesh is None:
-            op_dist_attr.set_process_mesh(compatible_process_mesh)
+            op_dist_attr.process_mesh = compatible_process_mesh
             changed = True
     else:
         outputs_process_meshes = []
         for tensor_node in op_node.outputs:
             if tensor_node.var() is not None:
-                tensor_dist_attr = dist_context.get_tensor_distributed_attr_for_graph(
+                tensor_dist_attr = dist_context.get_tensor_dist_attr_for_graph(
                     tensor_node)
-                tensor_process_mesh = tensor_dist_attr.get_process_mesh()
+                tensor_process_mesh = tensor_dist_attr.process_mesh
                 outputs_process_meshes.append(tensor_process_mesh)
         compatible_process_mesh = compute_compatible_process_mesh(
             outputs_process_meshes)
         if compatible_process_mesh is not None and op_process_mesh is None:
-            op_dist_attr.set_process_mesh(compatible_process_mesh)
+            op_dist_attr.process_mesh = compatible_process_mesh
             changed = True
     return changed
 
 
-def update_op_dims_mapping_by_default_dist_impl(op_dist_attr):
+def update_op_dims_mapping_by_default_dist_impl(dist_context, op_node):
     """Each operator has a default distributed operator, only allowed to be sharded in batch dimension."""
     changed = False
-    op_desc = op_dist_attr.get_owner_op().desc
+    if (not op_node.is_op()) or (op_node.op() is None):
+        return False
+    op_desc = op_node.op()
+    dist_op = dist_context.get_dist_op_for_graph(op_node)
+    op_dist_attr = dist_op.dist_attr
     # The following statement will be replaced by a more elegent way
     if op_desc.type() == "shape" or op_desc.type() == "slice":
         return False
@@ -130,7 +136,8 @@ def update_op_dims_mapping_by_default_dist_impl(op_dist_attr):
         xshape_arg_names = op_desc.output("XShape")
     batch_dim_mappings = []
     for arg_name in op_desc.input_arg_names():
-        if op_dist_attr.is_parameter(arg_name):
+        serial_tensor = dist_op.get_serial_input(arg_name)
+        if serial_tensor.is_parameter:
             continue
         dims_mapping = op_dist_attr.get_input_dims_mapping(arg_name)
         if len(dims_mapping) > 1:
@@ -140,7 +147,8 @@ def update_op_dims_mapping_by_default_dist_impl(op_dist_attr):
                         .format(op_desc.type(), idx, mapping)
         batch_dim_mappings.append(dims_mapping[0])
     for arg_name in op_desc.output_arg_names():
-        if op_dist_attr.is_parameter(arg_name):
+        serial_tensor = dist_op.get_serial_output(arg_name)
+        if serial_tensor.is_parameter:
             continue
         dims_mapping = op_dist_attr.get_output_dims_mapping(arg_name)
         if arg_name not in xshape_arg_names:
@@ -164,14 +172,16 @@ def update_op_dims_mapping_by_default_dist_impl(op_dist_attr):
     compatible_dim_mapping = compute_compatible_dim_mapping(batch_dim_mappings)
     assert compatible_dim_mapping is not None, "There is no compatible dim mapping."
     for arg_name in op_desc.input_arg_names():
-        if op_dist_attr.is_parameter(arg_name):
+        serial_tensor = dist_op.get_serial_input(arg_name)
+        if serial_tensor.is_parameter:
             continue
         dims_mapping = op_dist_attr.get_input_dims_mapping(arg_name)
         if compatible_dim_mapping != dims_mapping[0]:
             dims_mapping[0] = compatible_dim_mapping
             changed = True
     for arg_name in op_desc.output_arg_names():
-        if op_dist_attr.is_parameter(arg_name):
+        serial_tensor = dist_op.get_serial_output(arg_name)
+        if serial_tensor.is_parameter:
             continue
         dims_mapping = op_dist_attr.get_output_dims_mapping(arg_name)
         if arg_name not in xshape_arg_names:
@@ -186,10 +196,13 @@ def update_op_dims_mapping_by_default_dist_impl(op_dist_attr):
     return changed
 
 
-def update_op_dims_mapping_by_elementwise_like_dist_impl(op_dist_attr):
+def update_op_dims_mapping_by_elementwise_like_dist_impl(dist_context, op_node):
     """Element-wise operator can be sharded in any way (but should take care of broadcasting)."""
     changed = False
-    op_desc = op_dist_attr.get_owner_op().desc
+    if (not op_node.is_op()) or (op_node.op() is None):
+        return False
+    op_desc = op_node.op()
+    op_dist_attr = dist_context.get_op_dist_attr_for_graph(op_node)
 
     input_arg_names = op_desc.input_arg_names()
     input_dims_mapping_dict = {}
@@ -258,12 +271,11 @@ def update_tensor_node_dims_mapping(dist_context, tensor_node, fwd=True):
     # Skip reader tensor
     if tensor_desc.type() == core.VarDesc.VarType.READER:
         return False
-    tensor_dist_attr = dist_context.get_tensor_distributed_attr_for_graph(
-        tensor_node)
+    tensor_dist_attr = dist_context.get_tensor_dist_attr_for_graph(tensor_node)
     assert tensor_dist_attr is not None
     if tensor_dist_attr.is_annotated("dims_mapping"):
         return False
-    tensor_dims_mapping = tensor_dist_attr.get_dims_mapping()
+    tensor_dims_mapping = tensor_dist_attr.dims_mapping
     if fwd:
         dims_mapping_list = []
         for pred_op_node in tensor_node.inputs:
@@ -272,7 +284,7 @@ def update_tensor_node_dims_mapping(dist_context, tensor_node, fwd=True):
                     or pred_op_node.op().type() == "create_double_buffer_reader" \
                     or pred_op_node.op().type() == "read":
                     continue
-                op_dist_attr = dist_context.get_op_distributed_attr_for_graph(
+                op_dist_attr = dist_context.get_op_dist_attr_for_graph(
                     pred_op_node)
                 op_dims_mapping = op_dist_attr.get_output_dims_mapping(
                     tensor_desc.name())
@@ -282,7 +294,7 @@ def update_tensor_node_dims_mapping(dist_context, tensor_node, fwd=True):
             dims_mapping_list)
         if (compatible_dims_mapping is not None) and \
             (compatible_dims_mapping != tensor_dims_mapping):
-            tensor_dist_attr.set_dims_mapping(compatible_dims_mapping)
+            tensor_dist_attr.dims_mapping = compatible_dims_mapping
             changed = True
     else:
         dims_mapping_list = []
@@ -292,7 +304,7 @@ def update_tensor_node_dims_mapping(dist_context, tensor_node, fwd=True):
                     or succ_op_node.op().type() == "create_double_buffer_reader" \
                     or succ_op_node.op().type() == "read":
                     continue
-                op_dist_attr = dist_context.get_op_distributed_attr_for_graph(
+                op_dist_attr = dist_context.get_op_dist_attr_for_graph(
                     succ_op_node)
                 op_dims_mapping = op_dist_attr.get_input_dims_mapping(
                     tensor_desc.name())
@@ -302,7 +314,7 @@ def update_tensor_node_dims_mapping(dist_context, tensor_node, fwd=True):
             dims_mapping_list)
         if (compatible_dims_mapping is not None) and \
             (compatible_dims_mapping != tensor_dims_mapping):
-            tensor_dist_attr.set_dims_mapping(compatible_dims_mapping)
+            tensor_dist_attr.dims_mapping = compatible_dims_mapping
             changed = True
     return changed
 
@@ -317,7 +329,8 @@ def update_op_node_dims_mapping(dist_context, op_node, fwd=True):
         or op_desc.type() == "create_double_buffer_reader" \
         or op_desc.type() == "read":
         return False
-    op_dist_attr = dist_context.get_op_distributed_attr_for_graph(op_node)
+    dist_op = dist_context.get_dist_op_for_graph(op_node)
+    op_dist_attr = dist_op.dist_attr
     if fwd:
         for tensor_node in op_node.inputs:
             if tensor_node.var() is not None:
@@ -327,9 +340,9 @@ def update_op_node_dims_mapping(dist_context, op_node, fwd=True):
                 if op_dist_attr.is_annotated_input_dims_mapping(
                         tensor_desc.name()):
                     continue
-                tensor_dist_attr = dist_context.get_tensor_distributed_attr_for_graph(
+                tensor_dist_attr = dist_context.get_tensor_dist_attr_for_graph(
                     tensor_node)
-                tensor_dims_mapping = tensor_dist_attr.get_dims_mapping()
+                tensor_dims_mapping = tensor_dist_attr.dims_mapping
                 op_dims_mapping = op_dist_attr.get_input_dims_mapping(
                     tensor_desc.name())
                 compatible_dims_mapping = compute_compatible_dims_mapping(
@@ -341,26 +354,29 @@ def update_op_node_dims_mapping(dist_context, op_node, fwd=True):
                     changed = True
         # Find the most compatible implemenetations from the distributed operator
         op_dist_impl, op_dist_impl_idx = find_best_compatible_distributed_operator_impl(
-            op_desc.type(), op_dist_attr, fwd=True)
+            op_desc.type(), dist_op, fwd=True)
         if op_dist_impl is not None:
-            dim_changed = op_dist_impl.update_dims_mapping(op_dist_attr)
+            dim_changed = op_dist_impl.update_dims_mapping(dist_op)
             if dim_changed:
                 changed = True
             # This statement will be replaced by a good way
-            if op_dist_impl.is_compatible(op_dist_attr):
-                op_dist_attr.set_impl_idx(op_dist_impl_idx)
+            if op_dist_impl.is_compatible(dist_op):
+                op_dist_attr.impl_type = op_desc.type()
+                op_dist_attr.impl_idx = op_dist_impl_idx
         elif is_elementwise_like_op(op_desc.type()):
             dim_changed = update_op_dims_mapping_by_elementwise_like_dist_impl(
-                op_dist_attr)
+                dist_context, op_node)
             if dim_changed:
                 changed = True
-            op_dist_attr.set_impl_idx(-1)
+            op_dist_attr.impl_type = "element-wise"
+            op_dist_attr.impl_idx = -1
         else:
             dim_changed = update_op_dims_mapping_by_default_dist_impl(
-                op_dist_attr)
+                dist_context, op_node)
             if dim_changed:
                 changed = True
-            op_dist_attr.set_impl_idx(-2)
+            op_dist_attr.impl_type = "default"
+            op_dist_attr.impl_idx = -2
     else:
         for tensor_node in op_node.outputs:
             if tensor_node.var() is not None:
@@ -370,9 +386,9 @@ def update_op_node_dims_mapping(dist_context, op_node, fwd=True):
                 if op_dist_attr.is_annotated_output_dims_mapping(
                         tensor_desc.name()):
                     continue
-                tensor_dist_attr = dist_context.get_tensor_distributed_attr_for_graph(
+                tensor_dist_attr = dist_context.get_tensor_dist_attr_for_graph(
                     tensor_node)
-                tensor_dims_mapping = tensor_dist_attr.get_dims_mapping()
+                tensor_dims_mapping = tensor_dist_attr.dims_mapping
                 op_dims_mapping = op_dist_attr.get_output_dims_mapping(
                     tensor_desc.name())
                 compatible_dims_mapping = compute_compatible_dims_mapping(
@@ -384,26 +400,29 @@ def update_op_node_dims_mapping(dist_context, op_node, fwd=True):
                     changed = True
         # Find the most compatible implemenetations from the distributed operator
         op_dist_impl, op_dist_impl_idx = find_best_compatible_distributed_operator_impl(
-            op_desc.type(), op_dist_attr, fwd=False)
+            op_desc.type(), dist_op, fwd=False)
         if op_dist_impl is not None:
-            dim_changed = op_dist_impl.update_dims_mapping(op_dist_attr)
+            dim_changed = op_dist_impl.update_dims_mapping(dist_op)
             if dim_changed:
                 changed = True
             # This statement will be replaced by a good way
-            if op_dist_impl.is_compatible(op_dist_attr):
-                op_dist_attr.set_impl_idx(op_dist_impl_idx)
+            if op_dist_impl.is_compatible(dist_op):
+                op_dist_attr.impl_type = op_desc.type()
+                op_dist_attr.impl_idx = op_dist_impl_idx
         elif is_elementwise_like_op(op_desc.type()):
             dim_changed = update_op_dims_mapping_by_elementwise_like_dist_impl(
-                op_dist_attr)
+                dist_context, op_node)
             if dim_changed:
                 changed = True
-            op_dist_attr.set_impl_idx(-1)
+            op_dist_attr.impl_type = "element-wise"
+            op_dist_attr.impl_idx = -1
         else:
             dim_changed = update_op_dims_mapping_by_default_dist_impl(
-                op_dist_attr)
+                dist_context, op_node)
             if dim_changed:
                 changed = True
-            op_dist_attr.set_impl_idx(-2)
+            op_dist_attr.impl_type = "default"
+            op_dist_attr.impl_idx = -2
     return changed
 
 
@@ -421,18 +440,20 @@ def complete_annotation(program, dist_context=None):
     # Use the default distribted context for completeion if there is no one
     if dist_context is None:
         dist_context = get_default_distributed_context()
+        dist_context.serial_program = program
+    else:
+        dist_context.serial_program = program
 
-    # Initialize distributed attributes for all var and op node in program
-    dist_context.initialize_distributed_attr_for_program(program)
+    # print_program_with_dist_attr(program, dist_context)
 
-    # Convert program to graph
-    graph = framework.IrGraph(core.Graph(program.desc))
+    # Initialize distributed attributes for all var and op node in program
+    dist_context.init_dist_attr_for_program()
 
     # Initialize distributed attributes for all var and op node in graph
-    dist_context.initialize_distributed_attr_for_graph(graph)
+    dist_context.init_dist_attr_for_graph()
 
     # Complete process mesh for each node
-    all_nodes = list(graph.all_nodes())
+    all_nodes = list(dist_context.serial_graph.all_nodes())
 
     def sort_key_fun(node):
         first = -1
@@ -498,27 +519,27 @@ def sort_key_fun(node):
             is_wrong = False
             for node in all_nodes:
                 if node.is_var() and node.var() is not None:
-                    tensor_dist_attr = dist_context.get_tensor_distributed_attr_for_graph(
+                    tensor_dist_attr = dist_context.get_tensor_dist_attr_for_graph(
                         node)
-                    if tensor_dist_attr.get_process_mesh() is None:
+                    if tensor_dist_attr.process_mesh is None:
                         msg_str = ""
                         for op_node in node.inputs:
                             if op_node.op() is not None:
-                                op_dist_attr = dist_context.get_op_distributed_attr_for_graph(
+                                op_dist_attr = dist_context.get_op_dist_attr_for_graph(
                                     op_node)
                                 msg_str += "{} [{}], ".format(
                                     op_node.op().type(),
-                                    op_dist_attr.get_process_mesh())
+                                    op_dist_attr.process_mesh)
                             else:
                                 msg_str += "{} [{}], ".format(op_node.name(),
                                                               None)
                         for op_node in node.outputs:
                             if op_node.op() is not None:
-                                op_dist_attr = dist_context.get_op_distributed_attr_for_graph(
+                                op_dist_attr = dist_context.get_op_dist_attr_for_graph(
                                     op_node)
                                 msg_str += "{} [{}], ".format(
                                     op_node.op().type(),
-                                    op_dist_attr.get_process_mesh())
+                                    op_dist_attr.process_mesh)
                             else:
                                 msg_str += "{} [{}], ".format(op_node.name(),
                                                               None)
@@ -527,27 +548,26 @@ def sort_key_fun(node):
                         is_wrong = True
                         print(msg_str)
                 if node.is_op() and node.op() is not None:
-                    op_dist_attr = dist_context.get_op_distributed_attr_for_graph(
-                        node)
-                    if op_dist_attr.get_process_mesh() is None:
+                    op_dist_attr = dist_context.get_op_dist_attr_for_graph(node)
+                    if op_dist_attr.process_mesh is None:
                         msg_str = ""
                         for tensor_node in node.inputs:
                             if tensor_node.var() is not None:
-                                tensor_dist_attr = dist_context.get_tensor_distributed_attr_for_graph(
+                                tensor_dist_attr = dist_context.get_tensor_dist_attr_for_graph(
                                     tensor_node)
                                 msg_str += "{} [{}], ".format(
                                     tensor_node.var().name(),
-                                    tensor_dist_attr.get_process_mesh())
+                                    tensor_dist_attr.process_mesh)
                             else:
                                 msg_str += "{} [{}], ".format(
                                     tensor_node.name(), None)
                         for tensor_node in node.outputs:
                             if tensor_node.var() is not None:
-                                tensor_dist_attr = dist_context.get_tensor_distributed_attr_for_graph(
+                                tensor_dist_attr = dist_context.get_tensor_dist_attr_for_graph(
                                     tensor_node)
                                 msg_str += "{} [{}], ".format(
                                     tensor_node.var().name(),
-                                    tensor_dist_attr.get_process_mesh())
+                                    tensor_dist_attr.process_mesh)
                             else:
                                 msg_str += "{} [{}], ".format(
                                     tensor_node.name(), None)
@@ -592,11 +612,14 @@ def sort_key_fun(node):
             reach_fix_point = True
 
     # Copy the corresponding distributed attribute from graph to program
-    dist_context.copy_distribute_attr_from_graph_to_program(graph, program)
-    dist_context.clear_distributed_attr_for_graph()
+    dist_context.copy_dist_attr_from_graph_to_program()
+    dist_context.clear_dist_info_for_graph()
 
     # Do the validation check and amend some completion
-    dist_context.amend_distributed_attr_for_program()
+    dist_context.amend_dist_attr_for_program()
+
+    # print_program_with_dist_attr(program, dist_context)
+    dist_context.validate_dist_attr_for_program()
 
     return program
 
@@ -623,126 +646,123 @@ def _get_op_by_id(ops, id):
     if dist_context is None:
         dist_context = get_default_distributed_context()
 
-    grad_start_idx = -1
+    first_backward_op_idx = -1
     for idx, op in enumerate(auto_parallel_main_prog.global_block().ops):
         if int(op.attr('op_role')) == int(
                 int(core.op_proto_and_checker_maker.OpRole.Backward) | int(
                     core.op_proto_and_checker_maker.OpRole.Loss)):
             assert op.type == "fill_constant"
-            grad_start_idx = idx
+            first_backward_op_idx = idx
             break
 
-    assert grad_start_idx >= 0, "No backward procedure found in this program."
+    assert first_backward_op_idx >= 0, "No backward procedure found in this program."
 
     ops = list(auto_parallel_main_prog.global_block().ops)
     vars = auto_parallel_main_prog.global_block().vars
+    dist_op_context = dist_context.dist_op_context
 
-    for idx in range(grad_start_idx, len(ops)):
+    for idx in range(first_backward_op_idx, len(ops)):
 
         # complete the initial grad loss op
-        if idx == grad_start_idx:
-            grad_var = vars[ops[idx].output_arg_names[0]]
-            forward_var_name = _get_forward_varname_from_grad_varname(
-                grad_var.name)
-            forward_var = vars[forward_var_name]
-
-            # TODO complete other attribte for grad var
-            tensor_attr = TensorDistributedAttribute(grad_var, dist_context)
-            process_mesh = dist_context.get_tensor_distributed_attr_for_program(
-                forward_var).get_process_mesh()
-            dims_mapping = dist_context.get_tensor_distributed_attr_for_program(
-                forward_var).get_dims_mapping()
-            tensor_attr.set_dims_mapping(dims_mapping)
-            tensor_attr.set_process_mesh(process_mesh)
-            dist_context.set_tensor_distributed_attr_for_program(grad_var,
-                                                                 tensor_attr)
-
-            op_attr = OperatorDistributedAttribute(ops[idx], dist_context)
-            op_attr.set_process_mesh(process_mesh)
-            dist_context.set_op_distributed_attr_for_program(ops[idx], op_attr)
-            continue
+        if idx == first_backward_op_idx:
+            assert ops[idx].type == "fill_constant"
+            assert len(
+                ops[idx].input_arg_names
+            ) == 0, "first backward op should has only ONE output, but got [{}]".format(
+                len(ops[idx].input_arg_names))
+            assert len(
+                ops[idx].output_arg_names
+            ) == 1, "first backward op should has only ONE output, but got [{}]".format(
+                len(ops[idx].output_arg_names))
 
-        # TODO remove this when dist op handle its own grad scale
-        # in the data parallel mode, the loss op followed by scale op.
-        if ops[idx].type == "scale" and idx == grad_start_idx + 1:
-            assert grad_var.name in ops[
-                idx].input_arg_names and grad_var.name in ops[
-                    idx].output_arg_names
             grad_var = vars[ops[idx].output_arg_names[0]]
             forward_var_name = _get_forward_varname_from_grad_varname(
                 grad_var.name)
             forward_var = vars[forward_var_name]
-            process_mesh = dist_context.get_tensor_distributed_attr_for_program(
-                forward_var).get_process_mesh()
-            op_attr = OperatorDistributedAttribute(ops[idx], dist_context)
-            op_attr.set_process_mesh(process_mesh)
-            dist_context.set_op_distributed_attr_for_program(ops[idx], op_attr)
-            continue
 
-        # TODO remove this when dist op handle its own communication
-        # TODO should distinguish the dp allreduce and mp allreduce
-        # complete the c_allreduce_sum op for gradient in the data parallel mode.
-        if ops[idx].type == "c_allreduce_sum" and ops[
-                idx].input_arg_names == ops[idx].output_arg_names:
-            grad_var = vars[ops[idx].output_arg_names[0]]
-            op_attr = OperatorDistributedAttribute(ops[idx], dist_context)
-            process_mesh = dist_context.get_tensor_distributed_attr_for_program(
-                grad_var).get_process_mesh()
-            op_attr.set_process_mesh(process_mesh)
-            dist_context.set_op_distributed_attr_for_program(ops[idx], op_attr)
+            # TODO complete other attribte for grad var
+            tensor_dist_attr = TensorDistributedAttribute()
+            process_mesh = dist_context.get_tensor_dist_attr_for_program(
+                forward_var).process_mesh
+            dims_mapping = dist_context.get_tensor_dist_attr_for_program(
+                forward_var).dims_mapping
+            tensor_dist_attr.dims_mapping = dims_mapping
+            tensor_dist_attr.process_mesh = process_mesh
+            dist_context.set_tensor_dist_attr_for_program(grad_var,
+                                                          tensor_dist_attr)
+
+            op_dist_attr = OperatorDistributedAttribute()
+            op_dist_attr.process_mesh = process_mesh
+            op_dist_attr.set_output_dims_mapping(grad_var.name, dims_mapping)
+            dist_context.set_op_dist_attr_for_program(ops[idx], op_dist_attr)
             continue
 
         # complete the annotation of grad op (xxx_grad op or sum op)
-        grad_op = ops[idx]
-
         # xxx_grad op will have a corresponding forward op in gradopidx2opidx
-        dist_op_helper = dist_context.get_dist_op_helper()
-        if grad_op.desc.id() in dist_op_helper.gradopidx2opidx:
+        grad_op = ops[idx]
+        if grad_op.desc.id() in dist_op_context.gradopidx2opidx:
             # TODO support the case where one forward op corresponding to multiple xxx_grad op
             forward_op = _get_op_by_id(
-                ops[:grad_start_idx],
-                dist_op_helper.gradopidx2opidx[grad_op.desc.id()])
+                ops[:first_backward_op_idx],
+                dist_op_context.gradopidx2opidx[grad_op.desc.id()])
             assert forward_op is not None
 
             # op dist attr
-            forward_op_attr = dist_context.get_op_distributed_attr_for_program(
+            forward_op_dist_attr = dist_context.get_op_dist_attr_for_program(
                 forward_op)
-            grad_op_attr = OperatorDistributedAttribute(grad_op, dist_context)
-            grad_op_attr.set_process_mesh(forward_op_attr.get_process_mesh())
-
-            for var_name in grad_op.input_arg_names:
-                if "@GRAD" in var_name:
-                    dims_mapping = dist_context.get_tensor_distributed_attr_for_program(
-                        vars[var_name]).get_dims_mapping()
-                    grad_op_attr.set_input_dims_mapping(var_name, dims_mapping)
+            forward_op_process_mesh = forward_op_dist_attr.process_mesh
+            grad_op_dist_attr = OperatorDistributedAttribute()
+            grad_op_dist_attr.process_mesh = forward_op_process_mesh
+
+            # var 
+            for output_name in grad_op.desc.output_names():
+                assert len(grad_op.desc.output(output_name)) in [0, 1]
+                if _is_grad_var_name(output_name):
+                    input_name = _get_forward_varname_from_grad_varname(
+                        output_name)
                 else:
-                    dims_mapping = forward_op_attr.get_input_dims_mapping(
-                        var_name)
-                    # TODO fixed here
-                    if dims_mapping == None:
-                        dims_mapping = forward_op_attr.get_output_dims_mapping(
-                            var_name)
-                    assert dims_mapping is not None, "[{}]'s dims_mapping is None".format(
-                        var_name)
-                    grad_op_attr.set_input_dims_mapping(var_name, dims_mapping)
-            dist_context.set_op_distributed_attr_for_program(grad_op,
-                                                             grad_op_attr)
-            # var dist attr 
-            for var_name in grad_op.output_arg_names:
-                if _is_grad_var_name(var_name):
-
-                    forward_var_name = _get_forward_varname_from_grad_varname(
-                        var_name)
-                    forward_var = vars[forward_var_name]
-                    tensor_attr = TensorDistributedAttribute(vars[var_name],
-                                                             dist_context)
-                    process_mesh = grad_op_attr.get_process_mesh()
-                    dims_mapping = grad_op_attr.get_input_dims_mapping(
-                        forward_var_name)
-                    tensor_attr.set_process_mesh(process_mesh)
-                    tensor_attr.set_dims_mapping(dims_mapping)
-                    dist_context.set_tensor_distributed_attr_for_program(
-                        vars[var_name], tensor_attr)
+                    assert grad_op.type in [
+                        "cast", "c_identity", "c_allreduce_sum"
+                    ]
+                    input_name = "X"
+                assert input_name in forward_op.desc.input_names(
+                ), "var [{}] in op [{}]'s output but coulf not find [{}] in its forward op".format(
+                    output_name, grad_op.type, input_name)
+                if len(grad_op.desc.output(output_name)) == 1:
+                    assert len(forward_op.desc.input(input_name)) == 1
+                    input_var = vars[forward_op.desc.input(input_name)[0]]
+                    input_var_dist_attr = dist_context.get_tensor_dist_attr_for_program(
+                        input_var)
+                    assert input_var_dist_attr is not None, "[{}] has not dist attribute".format(
+                        input_var.name)
+                    ref_dims_mapping = input_var_dist_attr.dims_mapping
+
+                    # tensor dist attr
+                    output_var = vars[grad_op.desc.output(output_name)[0]]
+                    output_var_dist_attr = TensorDistributedAttribute()
+                    output_var_dist_attr.dims_mapping = ref_dims_mapping
+                    output_var_dist_attr.process_mesh = forward_op_process_mesh
+                    dist_context.set_tensor_dist_attr_for_program(
+                        output_var, output_var_dist_attr)
+
+                    # op dist attr
+                    grad_op_dist_attr.set_output_dims_mapping(output_var.name,
+                                                              ref_dims_mapping)
+
+            for input_name in grad_op.input_arg_names:
+                input_var = vars[input_name]
+                input_var_dist_attr = dist_context.get_tensor_dist_attr_for_program(
+                    input_var)
+                assert input_var_dist_attr is not None, "[{}] has not dist attribute".format(
+                    input_var.name)
+                ref_dims_mapping = input_var_dist_attr.dims_mapping
+                assert ref_dims_mapping is not None, "[{}] 's dims mapping is NONE".format(
+                    input_var.name)
+                grad_op_dist_attr.set_input_dims_mapping(input_name,
+                                                         ref_dims_mapping)
+
+            dist_context.set_op_dist_attr_for_program(grad_op,
+                                                      grad_op_dist_attr)
 
         # only sum op for merge mutiple version grad has no a corresponding mapping in gradopidx2opidx
         else:
@@ -754,29 +774,31 @@ def _get_op_by_id(ops, id):
             ref_forward_var_name = _get_forward_varname_from_grad_varname(
                 grad_op.output_arg_names[0])
             forward_var = vars[ref_forward_var_name]
-            ref_forward_var_dims_mapping = dist_context.get_tensor_distributed_attr_for_program(
-                forward_var).get_dims_mapping()
-            ref_forward_var_process_mesh = dist_context.get_tensor_distributed_attr_for_program(
-                forward_var).get_process_mesh()
+            ref_forward_var_dims_mapping = dist_context.get_tensor_dist_attr_for_program(
+                forward_var).dims_mapping
+            ref_forward_var_process_mesh = dist_context.get_tensor_dist_attr_for_program(
+                forward_var).process_mesh
 
             # output
-            tensor_attr = TensorDistributedAttribute(
-                vars[grad_op.output_arg_names[0]], dist_context)
-            tensor_attr.set_dims_mapping(ref_forward_var_dims_mapping)
-            tensor_attr.set_process_mesh(ref_forward_var_process_mesh)
-            dist_context.set_tensor_distributed_attr_for_program(
-                vars[grad_op.output_arg_names[0]], tensor_attr)
+            tensor_dist_attr = TensorDistributedAttribute()
+            tensor_dist_attr.dims_mapping = ref_forward_var_dims_mapping
+            tensor_dist_attr.process_mesh = ref_forward_var_process_mesh
+            dist_context.set_tensor_dist_attr_for_program(
+                vars[grad_op.output_arg_names[0]], tensor_dist_attr)
 
             # op
-            grad_op_attr = OperatorDistributedAttribute(grad_op, dist_context)
-            grad_op_attr.set_process_mesh(ref_forward_var_process_mesh)
+            grad_op_dist_attr = OperatorDistributedAttribute()
+            grad_op_dist_attr.process_mesh = ref_forward_var_process_mesh
             for var_name in grad_op.input_arg_names:
                 assert _get_forward_varname_from_grad_varname(
                     var_name) == ref_forward_var_name
-                grad_op_attr.set_input_dims_mapping(
+                grad_op_dist_attr.set_input_dims_mapping(
                     var_name, ref_forward_var_dims_mapping)
-            dist_context.set_op_distributed_attr_for_program(grad_op,
-                                                             grad_op_attr)
+
+            grad_op_dist_attr.set_output_dims_mapping(
+                grad_op.output_arg_names[0], ref_forward_var_dims_mapping)
+            dist_context.set_op_dist_attr_for_program(grad_op,
+                                                      grad_op_dist_attr)
 
 
 def complete_update_annotation(auto_parallel_main_prog, dist_context):
@@ -787,28 +809,88 @@ def complete_update_annotation(auto_parallel_main_prog, dist_context):
 
     ops = list(auto_parallel_main_prog.global_block().ops)
     vars = auto_parallel_main_prog.global_block().vars
+    learning_rate_completed = False
 
     for idx in range(len(ops)):
 
         # complete the annotation of the optimizer op.
         # TODO to add attribute for moment var
-        if int(ops[idx].attr('op_role')) == int(OpRole.Optimize):
-            if "Grad" in ops[idx].input_names and "Param" in ops[
-                    idx].input_names:
-                assert len(ops[idx].input(
+        op = ops[idx]
+        if int(op.attr('op_role')) == int(OpRole.Optimize):
+
+            if "Grad" in op.input_names and "Param" in ops[idx].input_names:
+                assert len(op.input(
                     "Param")) == 1, "Only support one-to-one now."
-                assert len(ops[idx].input(
+                assert len(op.input(
                     "Grad")) == 1, "Only support one-to-one now."
-                param = vars[ops[idx].input("Param")[0]]
-                grad_var = vars[ops[idx].input("Grad")[0]]
-                process_mesh = dist_context.get_tensor_distributed_attr_for_program(
-                    param).get_process_mesh()
-                dims_mapping = dist_context.get_tensor_distributed_attr_for_program(
-                    param).get_dims_mapping()
-                op_attr = OperatorDistributedAttribute(ops[idx], dist_context)
-                op_attr.set_process_mesh(process_mesh)
-                op_attr.set_input_dims_mapping(grad_var.name, dims_mapping)
-                op_attr.set_input_dims_mapping(param.name, dims_mapping)
-                dist_context.set_op_distributed_attr_for_program(ops[idx],
-                                                                 op_attr)
+                param = vars[op.input("Param")[0]]
+                grad_var = vars[op.input("Grad")[0]]
+
+                param_dist_attr = dist_context.get_tensor_dist_attr_for_program(
+                    param)
+                grad_dist_attr = dist_context.get_tensor_dist_attr_for_program(
+                    grad_var)
+
+                assert param_dist_attr is not None
+                assert grad_dist_attr is not None
+                assert param_dist_attr.dims_mapping == grad_dist_attr.dims_mapping
+
+                ref_process_mesh = dist_context.get_tensor_dist_attr_for_program(
+                    param).process_mesh
+                assert ref_process_mesh is not None
+                ref_dims_mapping = dist_context.get_tensor_dist_attr_for_program(
+                    param).dims_mapping
+                assert ref_dims_mapping is not None
+                op_dist_attr = OperatorDistributedAttribute()
+                op_dist_attr.process_mesh = ref_process_mesh
+                op_dist_attr.set_input_dims_mapping(grad_var.name,
+                                                    ref_dims_mapping)
+                op_dist_attr.set_input_dims_mapping(param.name,
+                                                    ref_dims_mapping)
+                op_dist_attr.set_output_dims_mapping(param.name,
+                                                     ref_dims_mapping)
+                learning_var = vars[op.input("LearningRate")[0]]
+                op_dist_attr.set_input_dims_mapping(learning_var.name, [-1])
+                op_dist_attr.set_output_dims_mapping(learning_var.name, [-1])
+
+                if not learning_rate_completed:
+                    learning_rate_completed = True
+                    var_dist_attr = TensorDistributedAttribute()
+                    var_dist_attr.process_mesh = ref_process_mesh
+                    var_dist_attr.dims_mapping = [-1]
+                    dist_context.set_tensor_dist_attr_for_program(learning_var,
+                                                                  var_dist_attr)
+
+                for input_name in op.desc.input_names():
+
+                    if input_name in [
+                            'Param', 'Grad', 'LearningRate', "SkipUpdate",
+                            "Beta1Tensor", "Beta2Tensor", "EpsilonTensor",
+                            "MasterParam"
+                    ]:
+                        continue
+
+                    assert len(op.desc.input(input_name)) == 1
+                    input_var = vars[op.desc.input(input_name)[0]]
+                    input_var_attr = TensorDistributedAttribute()
+
+                    if "Beta1Pow" in input_name or "Beta2Pow" in input_name:
+                        input_var_attr.dims_mapping = [-1]
+                        op_dist_attr.set_input_dims_mapping(input_var.name,
+                                                            [-1])
+                        op_dist_attr.set_output_dims_mapping(input_var.name,
+                                                             [-1])
+                    else:
+                        assert "Moment" in input_name
+                        input_var_attr.dims_mapping = ref_dims_mapping
+                        op_dist_attr.set_input_dims_mapping(input_var.name,
+                                                            ref_dims_mapping)
+                        op_dist_attr.set_output_dims_mapping(input_var.name,
+                                                             ref_dims_mapping)
+
+                    input_var_attr.process_mesh = ref_process_mesh
+                    dist_context.set_tensor_dist_attr_for_program(
+                        input_var, input_var_attr)
+
+                dist_context.set_op_dist_attr_for_program(op, op_dist_attr)
                 continue
diff --git a/python/paddle/distributed/auto_parallel/context.py b/python/paddle/distributed/auto_parallel/context.py
deleted file mode 100644
index 6785f21351aa4..0000000000000
--- a/python/paddle/distributed/auto_parallel/context.py
+++ /dev/null
@@ -1,495 +0,0 @@
-#   Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License
-
-import copy
-from collections import defaultdict
-from paddle.fluid import framework
-from paddle.fluid import core
-from .attribute import TensorDistributedAttribute
-from .attribute import OperatorDistributedAttribute
-from .utils import append_distributed_attr_suffix
-from .interface import _g_process_mesh_map
-
-# There always exists a default context for user. And user can set it to another one.
-DEFAULT_DISTRIBUTED_CONTEXT = None
-
-
-def get_default_distributed_context():
-    global DEFAULT_DISTRIBUTED_CONTEXT
-    if DEFAULT_DISTRIBUTED_CONTEXT is None:
-        dist_context = DistributedContext()
-        set_default_distributed_context(dist_context)
-    return DEFAULT_DISTRIBUTED_CONTEXT
-
-
-def set_default_distributed_context(dist_context):
-    global DEFAULT_DISTRIBUTED_CONTEXT
-    DEFAULT_DISTRIBUTED_CONTEXT = dist_context
-
-
-class DistributedContext:
-    """
-    DistributedContext is used to collect related distributed information for program and graph.
-    One auto-parallel run should use its own DistributedContext to avoid interfering other run.
-    """
-
-    def __init__(self):
-        self._is_initialized_for_program = False
-        self._is_initialized_for_graph = False
-        self._tensor_distributed_attr_map_for_program = {}
-        self._op_distributed_attr_map_for_program = {}
-        self._tensor_distributed_attr_map_for_graph = {}
-        self._op_distributed_attr_map_for_graph = {}
-        self._get_dist_op_helper = DistOpHelper()
-        self._process_mesh = _g_process_mesh_map.get(0, None)
-
-    def is_initialized_for_program(self):
-        return self._is_initialized_for_program
-
-    def is_initialized_for_graph(self):
-        return self._is_initialized_for_graph
-
-    def get_tensor_distributed_attr_for_program(self, tensor):
-        tensor_id = tensor.desc.id()
-        tensor_dist_attr = self._tensor_distributed_attr_map_for_program.get(
-            tensor_id, None)
-        return tensor_dist_attr
-
-    def set_tensor_distributed_attr_for_program(self, tensor, tensor_dist_attr):
-        tensor_id = tensor.desc.id()
-        self._tensor_distributed_attr_map_for_program[
-            tensor_id] = tensor_dist_attr
-
-    def get_op_distributed_attr_for_program(self, op):
-        op_id = op.desc.id()
-        op_dist_attr = self._op_distributed_attr_map_for_program.get(op_id,
-                                                                     None)
-        return op_dist_attr
-
-    def set_op_distributed_attr_for_program(self, op, op_dist_attr):
-        op_id = op.desc.id()
-        self._op_distributed_attr_map_for_program[op_id] = op_dist_attr
-
-    def get_tensor_distributed_attr_for_graph(self, tensor_node):
-        tensor_node_id = tensor_node.id()
-        tensor_dist_attr = self._tensor_distributed_attr_map_for_graph.get(
-            tensor_node_id, None)
-        return tensor_dist_attr
-
-    def set_tensor_distributed_attr_for_graph(self, tensor_node,
-                                              tensor_dist_attr):
-        tensor_node_id = tensor_node.id()
-        self._tensor_distributed_attr_map_for_graph[
-            tensor_node_id] = tensor_dist_attr
-
-    def get_op_distributed_attr_for_graph(self, op_node):
-        op_node_id = op_node.id()
-        op_dist_attr = self._op_distributed_attr_map_for_graph.get(op_node_id,
-                                                                   None)
-        return op_dist_attr
-
-    def set_op_distributed_attr_for_graph(self, op_node, op_dist_attr):
-        op_node_id = op_node.id()
-        self._op_distributed_attr_map_for_graph[op_node_id] = op_dist_attr
-
-    def set_process_mesh(self, process_mesh):
-        self._process_mesh = process_mesh
-
-    def get_dist_op_helper(self):
-        return self._get_dist_op_helper
-
-    def initialize_distributed_attr_for_program(self, program):
-        if self._is_initialized_for_program:
-            return
-        for block in program.blocks:
-            for tensor in block.vars.values():
-                # Since only tensors have distributed attributes, it's better to make sure var is a tensor
-                tensor_dist_attr = self.get_tensor_distributed_attr_for_program(
-                    tensor)
-                if tensor_dist_attr is None:
-                    tensor_dist_attr = TensorDistributedAttribute(tensor, self)
-                    self._copy_distributed_attr_from_tensor_desc(
-                        tensor.desc, tensor_dist_attr)
-                    self.set_tensor_distributed_attr_for_program(
-                        tensor, tensor_dist_attr)
-                if tensor.type == core.VarDesc.VarType.READER:
-                    tensor_dist_attr.set_shape([])
-                else:
-                    tensor_dist_attr.set_shape(tensor.desc.shape())
-                if tensor_dist_attr.get_process_mesh() is not None:
-                    tensor_dist_attr.mark_as_annotated("process_mesh")
-                if tensor_dist_attr.get_dims_mapping() is None:
-                    tensor_dims_mapping = [
-                        -1 for _ in range(len(tensor_dist_attr.get_shape()))
-                    ]
-                    tensor_dist_attr.set_dims_mapping(tensor_dims_mapping)
-                else:
-                    tensor_dist_attr.mark_as_annotated("dims_mapping")
-                if isinstance(tensor, framework.Parameter):
-                    tensor_dist_attr.mark_as_parameter()
-            for op in block.ops:
-                op_dist_attr = self.get_op_distributed_attr_for_program(op)
-                if op_dist_attr is None:
-                    op_dist_attr = OperatorDistributedAttribute(op, self)
-                    self._copy_distributed_attr_from_op_desc(op.desc,
-                                                             op_dist_attr)
-                    self.set_op_distributed_attr_for_program(op, op_dist_attr)
-                # Default distributed implementation for all operators
-                # This will be updated during the completion prcess
-                op_dist_attr.set_impl_idx(-2)
-                if op_dist_attr.get_process_mesh() is not None:
-                    op_dist_attr.mark_as_annotated("process_mesh")
-                for tensor_name in op.input_arg_names:
-                    # There may be a better way to find the tensor by name
-                    if op.type == "create_py_reader" \
-                        or tensor.type == core.VarDesc.VarType.READER:
-                        op_dist_attr.set_input_shape(tensor_name, [])
-                    else:
-                        tensor = op.block._var_recursive(tensor_name)
-                        op_dist_attr.set_input_shape(tensor_name,
-                                                     tensor.desc.shape())
-                    if op_dist_attr.get_input_dims_mapping(tensor_name) is None:
-                        tensor_dims_mapping = [
-                            -1
-                            for _ in range(
-                                len(op_dist_attr.get_input_shape(tensor_name)))
-                        ]
-                        op_dist_attr.set_input_dims_mapping(tensor_name,
-                                                            tensor_dims_mapping)
-                    else:
-                        op_dist_attr.mark_as_annotated_input_dims_mapping(
-                            tensor_name)
-                    if isinstance(tensor, framework.Parameter):
-                        op_dist_attr.mark_as_parameter(tensor_name)
-                for tensor_name in op.output_arg_names:
-                    tensor = op.block._var_recursive(tensor_name)
-                    if tensor.type == core.VarDesc.VarType.READER:
-                        op_dist_attr.set_output_shape(tensor_name, [])
-                    else:
-                        op_dist_attr.set_output_shape(tensor_name,
-                                                      tensor.desc.shape())
-                    if op_dist_attr.get_output_dims_mapping(
-                            tensor_name) is None:
-                        tensor_dims_mapping = [
-                            -1
-                            for _ in range(
-                                len(
-                                    op_dist_attr.get_output_shape(tensor_name)))
-                        ]
-                        op_dist_attr.set_output_dims_mapping(
-                            tensor_name, tensor_dims_mapping)
-                    else:
-                        op_dist_attr.mark_as_annotated_output_dims_mapping(
-                            tensor_name)
-                    if isinstance(tensor, framework.Parameter):
-                        op_dist_attr.mark_as_parameter(tensor_name)
-        self._is_initialized_for_program = True
-
-    def finalize_distributed_attr_for_program(self, program):
-        assert self._is_initialized_for_program, \
-            "The program must initialize its distributed attribute before finalization."
-        for block in program.blocks:
-            for tensor in block.vars.values():
-                tensor_dist_attr = self.get_tensor_distributed_attr_for_program(
-                    tensor)
-                if tensor_dist_attr is not None:
-                    self._store_distributed_attr_to_tensor_desc(
-                        tensor.desc, tensor_dist_attr)
-            for op in block.ops:
-                op_dist_attr = self.get_op_distributed_attr_for_program(op)
-                if op_dist_attr is not None:
-                    self._store_distributed_attr_to_op_desc(op.desc,
-                                                            op_dist_attr)
-
-    def _copy_distributed_attr_from_tensor_desc(self, desc, dist_attr):
-        from paddle.distributed.auto_parallel.interface import _g_process_mesh_map
-        attr_name = append_distributed_attr_suffix("mesh_id")
-        if desc.has_attr(attr_name):
-            mesh_id = desc.attr(attr_name)
-            process_mesh = _g_process_mesh_map[mesh_id]
-            copied_process_mesh = copy.deepcopy(process_mesh)
-            dist_attr.set_process_mesh(copied_process_mesh)
-        attr_name = append_distributed_attr_suffix("dim_mapping")
-        if desc.has_attr(attr_name):
-            dims_mapping = desc.attr(attr_name)
-            copied_dims_mapping = copy.deepcopy(dims_mapping)
-            dist_attr.set_dims_mapping(copied_dims_mapping)
-        attr_name = append_distributed_attr_suffix("mask")
-        if desc.has_attr(attr_name):
-            shard_mask = desc.attr(attr_name)
-            copied_shard_mask = copy.deepcopy(shard_mask)
-            dist_attr.set_shard_mask(copied_shard_mask)
-        attr_name = append_distributed_attr_suffix("offload_device")
-        if desc.has_attr(attr_name):
-            offload_device = desc.attr(attr_name)
-            copied_offload_device = copy.deepcopy(offload_device)
-            dist_attr.set_offload_device(copied_offload_device)
-
-    def _copy_distributed_attr_from_op_desc(self, desc, dist_attr):
-        from paddle.distributed.auto_parallel.interface import _g_process_mesh_map
-        attr_name = append_distributed_attr_suffix("mesh_id")
-        if desc.has_attr(attr_name):
-            mesh_id = desc.attr(attr_name)
-            process_mesh = _g_process_mesh_map[mesh_id]
-            copied_process_mesh = copy.deepcopy(process_mesh)
-            dist_attr.set_process_mesh(copied_process_mesh)
-        for tensor_name in desc.input_arg_names():
-            attr_name = append_distributed_attr_suffix("IN_" + tensor_name)
-            if desc.has_attr(attr_name):
-                dims_mapping = desc.attr(attr_name)
-                copied_dims_mapping = copy.deepcopy(dims_mapping)
-                dist_attr.set_input_dims_mapping(tensor_name,
-                                                 copied_dims_mapping)
-        for tensor_name in desc.output_arg_names():
-            attr_name = append_distributed_attr_suffix("OUT_" + tensor_name)
-            if desc.has_attr(attr_name):
-                dims_mapping = desc.attr(attr_name)
-                copied_dims_mapping = copy.deepcopy(dims_mapping)
-                dist_attr.set_input_dims_mapping(tensor_name,
-                                                 copied_dims_mapping)
-        attr_name = append_distributed_attr_suffix("pipeline_stage")
-        if desc.has_attr(attr_name):
-            pipeline_stage = desc.attr(attr_name)
-            copied_pipeline_stage = copy.deepcopy(pipeline_stage)
-            dist_attr.set_pipeline_stage(copied_pipeline_stage)
-
-    def _store_distributed_attr_to_tensor_desc(self, desc, dist_attr):
-        process_mesh = dist_attr.get_process_mesh()
-        if process_mesh is not None:
-            attr_name = append_distributed_attr_suffix("mesh_id")
-            desc._set_attr(attr_name, process_mesh._id)
-        dims_mapping = dist_attr.get_dims_mapping()
-        if dims_mapping is not None:
-            attr_name = append_distributed_attr_suffix("dim_mapping")
-            desc._set_attr(attr_name, dims_mapping)
-        shard_mask = dist_attr.get_shard_mask()
-        if shard_mask is not None:
-            attr_name = append_distributed_attr_suffix("mask")
-            desc._set_attr(attr_name, shard_mask)
-        offload_device = dist_attr.get_offload_device()
-        if offload_device is not None:
-            attr_name = append_distributed_attr_suffix("offload_device")
-            desc._set_attr(attr_name, offload_device)
-
-    def _store_distributed_attr_to_op_desc(self, desc, dist_attr):
-        process_mesh = dist_attr.get_process_mesh()
-        if process_mesh is not None:
-            attr_name = append_distributed_attr_suffix("mesh_id")
-            desc._set_attr(attr_name, process_mesh._id)
-        for tensor_name in desc.input_arg_names():
-            dims_mapping = dist_attr.get_input_dims_mapping(tensor_name)
-            if dims_mapping is not None:
-                attr_name = append_distributed_attr_suffix("IN_" + tensor_name)
-                desc._set_attr(attr_name, dims_mapping)
-        for tensor_name in desc.output_arg_names():
-            dims_mapping = dist_attr.get_output_dims_mapping(tensor_name)
-            if dims_mapping is not None:
-                attr_name = append_distributed_attr_suffix("OUT_" + tensor_name)
-                desc._set_attr(attr_name, dims_mapping)
-        pipeline_stage = dist_attr.get_pipeline_stage()
-        if pipeline_stage is not None:
-            attr_name = append_distributed_attr_suffix("pipeline_stage")
-            desc._set_attr(attr_name, pipeline_stage)
-
-    def initialize_distributed_attr_for_graph(self, graph):
-        assert self._is_initialized_for_program, \
-            "The program must initialize its distributed attribute before its graph."
-        if self._is_initialized_for_graph:
-            return
-        all_nodes = graph.all_nodes()
-        for node in all_nodes:
-            if node.is_var() and node.var() is not None:
-                tensor_desc = node.var()
-                tensor_id = tensor_desc.id()
-                tensor_dist_attr = self._tensor_distributed_attr_map_for_program[
-                    tensor_id]
-                assert tensor_dist_attr is not None, \
-                    "Tensor must have a distributed attribute after the initialization for program."
-                new_tensor_dist_attr = copy.deepcopy(tensor_dist_attr)
-                self.set_tensor_distributed_attr_for_graph(node,
-                                                           new_tensor_dist_attr)
-
-            if node.is_op() and node.op() is not None:
-                op_desc = node.op()
-                op_id = op_desc.id()
-                op_dist_attr = self._op_distributed_attr_map_for_program[op_id]
-                assert op_dist_attr is not None, \
-                    "Operator must have a distributed attribute after the initialization for program."
-                new_op_dist_attr = copy.deepcopy(op_dist_attr)
-                self.set_op_distributed_attr_for_graph(node, new_op_dist_attr)
-        self._is_initialized_for_graph = True
-
-    def clear_distributed_attr_for_program(self):
-        self._tensor_distributed_attr_map_for_program.clear()
-        self._op_distributed_attr_map_for_program.clear()
-
-    def clear_distributed_attr_for_graph(self):
-        self._tensor_distributed_attr_map_for_graph.clear()
-        self._op_distributed_attr_map_for_graph.clear()
-
-    def copy_distribute_attr_from_graph_to_program(self, graph, program):
-        assert self._is_initialized_for_program and self._is_initialized_for_graph, \
-            "The distribute attributes must be initialized both in its program and graph"
-        updated_tensors = {}
-        all_nodes = graph.all_nodes()
-        for node in all_nodes:
-            if node.is_var() and node.var() is not None:
-                tensor_desc = node.var()
-                tensor_id = tensor_desc.id()
-                updated = updated_tensors.get(tensor_desc.name(), False)
-                # If a var has multiples var nodes in graph, only use the first one for now
-                if not updated:
-                    tensor_dist_attr = self.get_tensor_distributed_attr_for_graph(
-                        node)
-                    new_tensor_dist_attr = copy.deepcopy(tensor_dist_attr)
-                    self._tensor_distributed_attr_map_for_program[
-                        tensor_id] = new_tensor_dist_attr
-                    updated_tensors[tensor_desc.name()] = True
-            if node.is_op() and node.op() is not None:
-                op_desc = node.op()
-                op_id = op_desc.id()
-                op_dist_attr = self.get_op_distributed_attr_for_graph(node)
-                new_op_dist_attr = copy.deepcopy(op_dist_attr)
-                self._op_distributed_attr_map_for_program[
-                    op_id] = new_op_dist_attr
-
-    def amend_distributed_attr_for_program(self):
-        for attr in self._tensor_distributed_attr_map_for_program.values():
-            assert attr.is_valid(), \
-                "Tensor's distributed attribute {} is not valid".format(attr)
-            tensor_shape = attr.get_shape()
-            dims_mapping = attr.get_dims_mapping()
-            process_mesh_shape = attr.get_process_mesh().topology
-            # If the dimension of tensor is less than the sharding dimension of process mesh,
-            # we just amend the dimension mapping to -1. (Is this really OK?)
-            for i in range(len(tensor_shape)):
-                if dims_mapping[i] != -1 and tensor_shape[i] > 0 \
-                    and process_mesh_shape[dims_mapping[i]] > tensor_shape[i]:
-                    dims_mapping[i] = -1
-
-        for attr in self._op_distributed_attr_map_for_program.values():
-            assert attr.is_valid(), \
-                "Operator's distributed attribute {} is not valid".format(attr)
-            for arg_name in attr.get_owner_op().desc.input_arg_names():
-                tensor_shape = attr.get_input_shape(arg_name)
-                dims_mapping = attr.get_input_dims_mapping(arg_name)
-                process_mesh_shape = attr.get_process_mesh().topology
-                # If the dimension of tensor is less than the sharding dimension of process mesh,
-                # we just amend the dimension mapping to -1. (Is this really OK?)
-                for i in range(len(tensor_shape)):
-                    if dims_mapping[i] != -1 and tensor_shape[i] > 0 \
-                        and process_mesh_shape[dims_mapping[i]] > tensor_shape[i]:
-                        dims_mapping[i] = -1
-
-            for arg_name in attr.get_owner_op().desc.output_arg_names():
-                tensor_shape = attr.get_output_shape(arg_name)
-                dims_mapping = attr.get_output_dims_mapping(arg_name)
-                process_mesh_shape = attr.get_process_mesh().topology
-                # If the dimension of tensor is less than the sharding dimension of process mesh,
-                # we just amend the dimension mapping to -1. (Is this really OK?)
-                for i in range(len(tensor_shape)):
-                    if dims_mapping[i] != -1 and tensor_shape[i] > 0 \
-                        and process_mesh_shape[dims_mapping[i]] > tensor_shape[i]:
-                        dims_mapping[i] = -1
-
-
-class DistOpHelper:
-    """
-    DistOpHelper is used to create a dist op desc in Program.
-    Every time to create a new dist op, the context should be updated for it accordingly.
-    """
-
-    def __init__(self):
-        self._dst_main_program = None
-        self._dst_startup_program = None
-        self._varname_mapping = None
-        self._rank_id = None
-        self._cur_src_op = None
-        self._cur_dist_attr = None
-        self.gradopidx2opidx = {}
-        self.already_init_sync_vars = set()
-
-    def set_dst_main_program(self, prog):
-        self._dst_main_program = prog
-
-    def get_dst_main_program(self):
-        return self._dst_main_program
-
-    def set_dst_startup_program(self, prog):
-        self._dst_startup_program = prog
-
-    def get_dst_startup_program(self):
-        return self._dst_startup_program
-
-    def set_varname_mapping(self, mapping):
-        self._varname_mapping = mapping
-
-    def get_varname_mapping(self):
-        return self._varname_mapping
-
-    def set_rank_id(self, rank_id):
-        self._rank_id = rank_id
-
-    def get_rank_id(self):
-        return self._rank_id
-
-    def set_cur_src_op(self, cur_src_op):
-        self._cur_src_op = cur_src_op
-
-    def get_cur_src_op(self):
-        return self._cur_src_op
-
-    def prepare_forward_context(self, src_op):
-
-        self.set_cur_src_op(src_op)
-
-        # build input varname mapping
-        kinputs = {}
-        for input_name in src_op.desc.input_names():
-            varnames = []
-            for varname in src_op.desc.input(input_name):
-                varnames.append(self._varname_mapping[varname])
-            kinputs[input_name] = varnames
-
-        # build output varname mapping
-        koutputs = {}
-        for output_name in src_op.desc.output_names():
-            varnames = []
-            for varname in src_op.desc.output(output_name):
-                varnames.append(self._varname_mapping[varname])
-            koutputs[output_name] = varnames
-
-        return kinputs, koutputs
-
-    def prepare_backward_context(self, backward_op):
-
-        self.set_cur_src_op(backward_op)
-
-        # build input varname mapping
-        kinputs = {}
-        for input_name in backward_op.desc.input_names():
-            varnames = []
-            for varname in backward_op.desc.input(input_name):
-                varnames.append(varname)
-            kinputs[input_name] = varnames
-
-        # build output varname mapping
-        koutputs = {}
-        for output_name in backward_op.desc.output_names():
-            varnames = []
-            for varname in backward_op.desc.output(output_name):
-                varnames.append(varname)
-            koutputs[output_name] = varnames
-
-        return kinputs, koutputs
diff --git a/python/paddle/distributed/auto_parallel/cost_model.py b/python/paddle/distributed/auto_parallel/cost_model.py
index 3fd438e2a624a..b1ff4fb0ba7c9 100644
--- a/python/paddle/distributed/auto_parallel/cost_model.py
+++ b/python/paddle/distributed/auto_parallel/cost_model.py
@@ -131,7 +131,7 @@ def __init__(self,
         elif node.dtype == paddle.int64:
             self.dtype_factor *= 8
         else:
-            raise NotImplementedError("{} not counted".format(v.node.dtype))
+            raise NotImplementedError("{} not counted".format(node.dtype))
 
         self.batch_size = None
         if batch_size is not None:
diff --git a/python/paddle/distributed/auto_parallel/dist_attribute.py b/python/paddle/distributed/auto_parallel/dist_attribute.py
new file mode 100644
index 0000000000000..4415448769d01
--- /dev/null
+++ b/python/paddle/distributed/auto_parallel/dist_attribute.py
@@ -0,0 +1,436 @@
+#   Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License
+
+import copy
+from collections import defaultdict
+from paddle.fluid.framework import Variable
+from .process_mesh import ProcessMesh
+
+_g_tensor_dist_attr_field_keys = [
+    "process_mesh", "dims_mapping", "shard_sizes", "device_placement"
+]
+
+_g_op_dist_attr_field_keys = ["process_mesh", "impl_type", "impl_idx"]
+
+_g_op_input_suffix = "@input"
+
+_g_op_output_suffix = "@output"
+
+
+def get_tensor_dist_attr_field_keys():
+    global _g_tensor_dist_attr_field_keys
+    return _g_tensor_dist_attr_field_keys
+
+
+def get_op_dist_attr_field_keys():
+    global _g_op_dist_attr_field_keys
+    return _g_op_dist_attr_field_keys
+
+
+def append_op_input_suffix(name):
+    global _g_op_input_suffix
+    return name + _g_op_input_suffix
+
+
+def append_op_output_suffix(name):
+    global _g_op_output_suffix
+    return name + _g_op_output_suffix
+
+
+class TensorDistributedAttribute:
+    def __init__(self):
+        # The process mesh of distributed operator attribute must is the same as 
+        # the process meshes of all input and output distributed attributed
+        self._process_mesh = None
+        self._dims_mapping = None
+        self._shard_sizes = None
+        self._device_placement = None
+        self._is_annotated = {}
+
+    @property
+    def process_mesh(self):
+        return self._process_mesh
+
+    @process_mesh.setter
+    def process_mesh(self, process_mesh):
+        if process_mesh is not None:
+            assert isinstance(process_mesh, (list, ProcessMesh)), \
+                "The type of process_mesh must be list or ProcessMesh."
+            if isinstance(process_mesh, list):
+                process_mesh = ProcessMesh(process_mesh)
+            self._process_mesh = copy.deepcopy(process_mesh)
+
+    @property
+    def dims_mapping(self):
+        return self._dims_mapping
+
+    @dims_mapping.setter
+    def dims_mapping(self, dims_mapping):
+        if dims_mapping is not None:
+            assert isinstance(dims_mapping, list), \
+                "The type of dims_mapping must be list."
+            assert all(isinstance(x, int) for x in dims_mapping), \
+                ("All elements of dims_mapping must be integer")
+            assert all(x >= -1 for x in dims_mapping), \
+                ("All elements of dims_mapping must be greater than or equal to -1.")
+            self._dims_mapping = copy.deepcopy(dims_mapping)
+
+    @property
+    def shard_sizes(self):
+        return self._shard_sizes
+
+    @shard_sizes.setter
+    def shard_sizes(self, shard_sizes):
+        if shard_sizes is not None:
+            self._shard_sizes = copy.deepcopy(shard_sizes)
+
+    @property
+    def device_placement(self):
+        return self._device_placement
+
+    @device_placement.setter
+    def device_placement(self, device_placement):
+        if device_placement is not None:
+            self._device_placement = copy.deepcopy(device_placement)
+
+    def init(self, dist_attr):
+        if dist_attr is None:
+            return
+        assert isinstance(dist_attr, (dict, TensorDistributedAttribute)), \
+            "The type of dist_attr must be dict or TensorDistributedAttribute."
+        if isinstance(dist_attr, dict):
+            for key, value in dist_attr.items():
+                if key in get_tensor_dist_attr_field_keys():
+                    field_property = TensorDistributedAttribute.__dict__.get(
+                        key, None)
+                    if field_property:
+                        field_property.fset(self, value)
+                    else:
+                        assert False, "No setter for {} in args {}.".format(
+                            key, dist_attr)
+        elif isinstance(dist_attr, TensorDistributedAttribute):
+            for key in get_tensor_dist_attr_field_keys():
+                field_property = TensorDistributedAttribute.__dict__.get(key,
+                                                                         None)
+                if field_property:
+                    field_property.fset(self, field_property.fget(dist_attr))
+                else:
+                    assert False, "No setter for {} in args {}.".format(
+                        key, dist_attr)
+            self._is_annotated = copy.deepcopy(dist_attr._is_annotated)
+
+    def is_annotated(self, dist_attr_field_name):
+        return self._is_annotated.get(dist_attr_field_name, False)
+
+    def mark_annotated(self, dist_attr_field_name):
+        self._is_annotated[dist_attr_field_name] = True
+
+    def mark_annotated_as(self, dist_attr):
+        if dist_attr is None:
+            return
+        assert isinstance(dist_attr, (dict, TensorDistributedAttribute)), \
+            "The type of dist_attr must be dict or TensorDistributedAttribute."
+        if isinstance(dist_attr, dict):
+            for key in dist_attr.keys():
+                if key in get_tensor_dist_attr_field_keys():
+                    self.mark_annotated(key)
+        elif isinstance(dist_attr, TensorDistributedAttribute):
+            self._is_annotated = copy.deepcopy(dist_attr._is_annotated)
+
+    def clear_annotated(self):
+        self._is_annotated.clear()
+
+    def __str__(self):
+        str = "\n\ttensor_dist_attr = {"
+        if self.is_annotated("process_mesh"):
+            annotated_str = "annotated"
+        else:
+            annotated_str = "non-annotated"
+        str += "\n\t\tprocess_mesh ({}): {},".format(annotated_str,
+                                                     self.process_mesh)
+
+        if self.is_annotated("dims_mapping"):
+            annotated_str = "annotated"
+        else:
+            annotated_str = "non-annotated"
+        str += "\n\t\tdims_mapping ({}): {}".format(annotated_str,
+                                                    self.dims_mapping)
+        str += "\n\t}"
+        return str
+
+
+class OperatorDistributedAttribute:
+    def __init__(self):
+        self._process_mesh = None
+        self._impl_type = None
+        self._impl_idx = None
+        self._inputs_dist_attrs = {}
+        self._outputs_dist_attrs = {}
+        self._is_annotated = {}
+
+    @property
+    def process_mesh(self):
+        return self._process_mesh
+
+    @process_mesh.setter
+    def process_mesh(self, process_mesh):
+        if process_mesh is not None:
+            assert isinstance(process_mesh, (list, ProcessMesh)), \
+                "The type of process_mesh must be list or ProcessMesh."
+            if isinstance(process_mesh, list):
+                process_mesh = ProcessMesh(process_mesh)
+            self._process_mesh = copy.deepcopy(process_mesh)
+            for dist_attr in self._inputs_dist_attrs.values():
+                dist_attr.process_mesh = process_mesh
+            for dist_attr in self._outputs_dist_attrs.values():
+                dist_attr.process_mesh = process_mesh
+
+    @property
+    def impl_type(self):
+        return self._impl_type
+
+    @impl_type.setter
+    def impl_type(self, impl_type):
+        if impl_type is not None:
+            self._impl_type = impl_type
+
+    @property
+    def impl_idx(self):
+        return self._impl_idx
+
+    @impl_idx.setter
+    def impl_idx(self, impl_idx):
+        if impl_idx is not None:
+            self._impl_idx = impl_idx
+
+    @property
+    def inputs_dist_attrs(self):
+        return self._inputs_dist_attrs
+
+    @property
+    def outputs_dist_attrs(self):
+        return self._outputs_dist_attrs
+
+    def get_input_dist_attr(self, name):
+        return self._inputs_dist_attrs.get(name, None)
+
+    def set_input_dist_attr(self, name, dist_attr):
+        dist_attr_object = TensorDistributedAttribute()
+        dist_attr_object.init(dist_attr)
+        self._inputs_dist_attrs[name] = dist_attr_object
+
+    def get_output_dist_attr(self, name):
+        return self._outputs_dist_attrs.get(name, None)
+
+    def set_output_dist_attr(self, name, dist_attr):
+        dist_attr_object = TensorDistributedAttribute()
+        dist_attr_object.init(dist_attr)
+        self._outputs_dist_attrs[name] = dist_attr_object
+
+    def get_input_dims_mapping(self, name):
+        input_dist_attr = self.get_input_dist_attr(name)
+        if input_dist_attr:
+            dims_mapping = input_dist_attr.dims_mapping
+        else:
+            dims_mapping = None
+        return dims_mapping
+
+    def set_input_dims_mapping(self, name, dims_mapping):
+        input_dist_attr = self.get_input_dist_attr(name)
+        if input_dist_attr:
+            input_dist_attr.dims_mapping = dims_mapping
+        else:
+            dist_attr = TensorDistributedAttribute()
+            dist_attr.dims_mapping = dims_mapping
+            self._inputs_dist_attrs[name] = dist_attr
+
+    def get_output_dims_mapping(self, name):
+        output_dist_attr = self.get_output_dist_attr(name)
+        if output_dist_attr:
+            dims_mapping = output_dist_attr.dims_mapping
+        else:
+            dims_mapping = None
+        return dims_mapping
+
+    def set_output_dims_mapping(self, name, dims_mapping):
+        output_dist_attr = self.get_output_dist_attr(name)
+        if output_dist_attr:
+            output_dist_attr.dims_mapping = dims_mapping
+        else:
+            dist_attr = TensorDistributedAttribute()
+            dist_attr.dims_mapping = dims_mapping
+            self._outputs_dist_attrs[name] = dist_attr
+
+    def init(self, dist_attr):
+        if dist_attr is None:
+            return
+        assert isinstance(dist_attr, (dict, OperatorDistributedAttribute)), \
+            "The type of dist_attr must be dict or OperatorDistributedAttribute."
+        if isinstance(dist_attr, dict):
+            for key, value in dist_attr.items():
+                if isinstance(key, Variable):
+                    tensor_dist_attr = TensorDistributedAttribute()
+                    tensor_dist_attr.init(value)
+                    if dist_attr.get(append_op_input_suffix(key.name), False):
+                        self.set_input_dist_attr(key.name, tensor_dist_attr)
+                    if dist_attr.get(append_op_output_suffix(key.name), False):
+                        self.set_output_dist_attr(key.name, tensor_dist_attr)
+                else:
+                    if key in get_op_dist_attr_field_keys():
+                        field_property = OperatorDistributedAttribute.__dict__.get(
+                            key, None)
+                        if field_property:
+                            field_property.fset(self, value)
+                        else:
+                            assert False, "No setter for {} in args {}.".format(
+                                key, dist_attr)
+        elif isinstance(dist_attr, OperatorDistributedAttribute):
+            for tensor_name, tensor_dist_attr in dist_attr.inputs_dist_attrs.items(
+            ):
+                self.set_input_dist_attr(
+                    tensor_name, dist_attr.get_input_dist_attr(tensor_name))
+            for tensor_name, tensor_dist_attr in dist_attr.outputs_dist_attrs.items(
+            ):
+                self.set_output_dist_attr(
+                    tensor_name, dist_attr.get_output_dist_attr(tensor_name))
+            self._is_annotated = copy.deepcopy(dist_attr._is_annotated)
+            for key in get_op_dist_attr_field_keys():
+                field_property = OperatorDistributedAttribute.__dict__.get(key,
+                                                                           None)
+                if field_property:
+                    field_property.fset(self, field_property.fget(dist_attr))
+                else:
+                    assert False, "No setter for {} in args {}.".format(
+                        key, dist_attr)
+        # Make sure proscess_meshes in dist op be same
+        process_meshes = []
+        process_meshes.append(self.process_mesh)
+        for tensor_dist_attr in self.inputs_dist_attrs.values():
+            process_meshes.append(tensor_dist_attr.process_mesh)
+        for tensor_dist_attr in self.outputs_dist_attrs.values():
+            process_meshes.append(tensor_dist_attr.process_mesh)
+        shared_process_mesh = None
+        for process_mesh in process_meshes:
+            if process_mesh is not None:
+                if shared_process_mesh is None:
+                    shared_process_mesh = process_mesh
+                else:
+                    assert process_mesh == shared_process_mesh, \
+                        "ProcessMeshes in DistributedOperator must be the same."
+        self.process_mesh = shared_process_mesh
+
+    def is_annotated(self, attr_name):
+        return self._is_annotated.get(attr_name, False)
+
+    def mark_annotated(self, attr_name):
+        if attr_name == "process_mesh":
+            # Make sure proscess_mesh be annotated consistently
+            self._is_annotated[attr_name] = True
+            for tensor_dist_attr in self.inputs_dist_attrs.values():
+                tensor_dist_attr.mark_annotated(attr_name)
+            for tensor_dist_attr in self.outputs_dist_attrs.values():
+                tensor_dist_attr.mark_annotated(attr_name)
+        else:
+            self._is_annotated[attr_name] = True
+
+    def mark_annotated_as(self, dist_attr):
+        if dist_attr is None:
+            return
+        assert isinstance(dist_attr, (dict, OperatorDistributedAttribute)), \
+            "The type of dist_attr must be dict or OperatorDistributedAttribute."
+        if isinstance(dist_attr, dict):
+            for key, value in dist_attr.items():
+                if isinstance(key, Variable):
+                    input_dist_attr = self.get_input_dist_attr(key.name)
+                    if input_dist_attr is not None:
+                        input_dist_attr.mark_annotated_as(value)
+                    output_dist_attr = self.get_output_dist_attr(key.name)
+                    if output_dist_attr is not None:
+                        output_dist_attr.mark_annotated_as(value)
+                else:
+                    if key in get_op_dist_attr_field_keys():
+                        self.mark_annotated(key)
+            process_mesh_annotated = False
+            if self.is_annotated("process_mesh"):
+                process_mesh_annotated = True
+            for tensor_dist_attr in self.inputs_dist_attrs.values():
+                if tensor_dist_attr.is_annotated("process_mesh"):
+                    process_mesh_annotated = True
+            for tensor_dist_attr in self.outputs_dist_attrs.values():
+                if tensor_dist_attr.is_annotated("process_mesh"):
+                    process_mesh_annotated = True
+            if process_mesh_annotated:
+                self.mark_annotated("process_mesh")
+        elif isinstance(dist_attr, OperatorDistributedAttribute):
+            process_mesh_annotated = False
+            self._is_annotated = copy.deepcopy(dist_attr._is_annotated)
+            if self.is_annotated("process_mesh"):
+                process_mesh_annotated = True
+            for tensor_name, tensor_dist_attr in dist_attr.inputs_dist_attrs.items(
+            ):
+                input_dist_attr = self.get_input_dist_attr(tensor_name)
+                if input_dist_attr is not None:
+                    input_dist_attr.mark_annotated_as(tensor_dist_attr)
+                    if input_dist_attr.is_annotated("process_mesh"):
+                        process_mesh_annotated = True
+            for tensor_name, tensor_dist_attr in dist_attr.outputs_dist_attrs.items(
+            ):
+                output_dist_attr = self.get_output_dist_attr(tensor_name)
+                if output_dist_attr is not None:
+                    output_dist_attr.mark_annotated_as(tensor_dist_attr)
+                    if output_dist_attr.is_annotated("process_mesh"):
+                        process_mesh_annotated = True
+            if process_mesh_annotated:
+                self.mark_annotated("process_mesh")
+
+    def clear_annotated(self):
+        self._is_annotated.clear()
+        for tensor_dist_attr in self.inputs_dist_attrs.values():
+            tensor_dist_attr.clear_annotated()
+        for tensor_dist_attr in self.outputs_dist_attrs.values():
+            tensor_dist_attr.clear_annotated()
+
+    def is_annotated_input_dims_mapping(self, name):
+        input_dist_attr = self.get_input_dist_attr(name)
+        if input_dist_attr:
+            return input_dist_attr.is_annotated("dims_mapping")
+        else:
+            return False
+
+    def is_annotated_output_dims_mapping(self, name):
+        output_dist_attr = self.get_output_dist_attr(name)
+        if output_dist_attr:
+            return output_dist_attr.is_annotated("dims_mapping")
+        else:
+            return False
+
+    def __str__(self):
+        str = "\n\top_dist_attr = {"
+        if self.is_annotated("process_mesh"):
+            annotated_str = "annotated"
+        else:
+            annotated_str = "non-annotated"
+        str += "\n\t\tprocess_mesh ({}): {},".format(annotated_str,
+                                                     self.process_mesh)
+
+        for arg_name, tensor_dist_attr in self.inputs_dist_attrs.items():
+            str += "\n\t\t{}'s: {},".format(arg_name, tensor_dist_attr)
+
+        for arg_name, tensor_dist_attr in self.outputs_dist_attrs.items():
+            str += "\n\t\t{}'s: {},".format(arg_name, tensor_dist_attr)
+
+        str += "\n\t\timpl type: {}, ".format(self._impl_type)
+        str += "impl idx: {}".format(self._impl_idx)
+        str += "\n\t}"
+        return str
diff --git a/python/paddle/distributed/auto_parallel/dist_context.py b/python/paddle/distributed/auto_parallel/dist_context.py
new file mode 100755
index 0000000000000..e3b3ee6a3760a
--- /dev/null
+++ b/python/paddle/distributed/auto_parallel/dist_context.py
@@ -0,0 +1,427 @@
+#   Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License
+
+import copy
+from collections import defaultdict
+from paddle.fluid import framework
+from paddle.fluid import core
+from .dist_attribute import TensorDistributedAttribute
+from .dist_attribute import OperatorDistributedAttribute
+from .dist_tensor import DistributedTensor
+from .dist_op import DistributedOperator
+from .process_mesh import ProcessMesh
+
+# There always exists a default context for user. And user can set it to another one.
+_g_default_distributed_context = None
+
+
+def get_default_distributed_context():
+    global _g_default_distributed_context
+    if _g_default_distributed_context is None:
+        dist_context = DistributedContext()
+        set_default_distributed_context(dist_context)
+    return _g_default_distributed_context
+
+
+def set_default_distributed_context(dist_context):
+    global _g_default_distributed_context
+    _g_default_distributed_context = dist_context
+
+
+class DistributedContext:
+    """
+    DistributedContext is used to collect related distributed information for program and graph.
+    One auto-parallel run should use its own DistributedContext to avoid interfering other run.
+    """
+
+    def __init__(self, program=None):
+        self._serial_program = program
+        self._serial_graph = None
+        self._is_initialized_for_program = False
+        self._is_initialized_for_graph = False
+        self._dist_tensors_for_program = {}
+        self._dist_ops_for_program = {}
+        self._dist_tensors_for_graph = {}
+        self._dist_ops_for_graph = {}
+        self._dist_op_context = DistributedOperatorContext()
+        self._process_meshes = []
+
+    @property
+    def serial_program(self):
+        return self._serial_program
+
+    @property
+    def serial_graph(self):
+        return self._serial_graph
+
+    @serial_program.setter
+    def serial_program(self, program):
+        assert self._serial_program is None, \
+            "This distributed context has already been realted to a serial program"
+        self._serial_program = program
+
+    @property
+    def process_meshes(self):
+        return self._process_meshes
+
+    @property
+    def dist_op_context(self):
+        return self._dist_op_context
+
+    def add_process_mesh(self, process_mesh):
+        assert isinstance(process_mesh, ProcessMesh), \
+            'The type of dim_mapping must be ProcessMesh.'
+        if process_mesh not in self.process_meshes:
+            self._process_meshes.append(process_mesh)
+
+    def add_dist_tensor_for_program(self, dist_tensor):
+        inner_serial_tensor = dist_tensor.serial_tensor
+        inner_serial_tensor_id = inner_serial_tensor.desc.id()
+        self._dist_tensors_for_program[inner_serial_tensor_id] = dist_tensor
+
+    def add_dist_op_for_program(self, dist_op):
+        inner_serial_op = dist_op.serial_op
+        inner_serial_op_id = inner_serial_op.desc.id()
+        self._dist_ops_for_program[inner_serial_op_id] = dist_op
+
+    def get_dist_tensor_for_program(self, serial_tensor):
+        serial_tensor_id = serial_tensor.desc.id()
+        return self._dist_tensors_for_program.get(serial_tensor_id, None)
+
+    def get_dist_tensor_for_graph(self, serial_tensor_node):
+        serial_tensor_node_id = serial_tensor_node.id()
+        return self._dist_tensors_for_graph.get(serial_tensor_node_id, None)
+
+    def get_dist_op_for_program(self, serial_tensor):
+        serial_tensor_id = serial_tensor.desc.id()
+        return self._dist_ops_for_program.get(serial_tensor_id, None)
+
+    def get_dist_op_for_graph(self, serial_tensor_node):
+        serial_tensor_node_id = serial_tensor_node.id()
+        return self._dist_ops_for_graph.get(serial_tensor_node_id, None)
+
+    def get_tensor_dist_attr_for_program(self, serial_tensor):
+        serial_tensor_id = serial_tensor.desc.id()
+        dist_tensor = self._dist_tensors_for_program.get(serial_tensor_id, None)
+        if dist_tensor:
+            return dist_tensor.dist_attr
+        else:
+            return None
+
+    def set_tensor_dist_attr_for_program(self, serial_tensor, dist_attr):
+        dist_tensor = DistributedTensor(serial_tensor, dist_attr)
+        self.add_dist_tensor_for_program(dist_tensor)
+
+    def get_tensor_dist_attr_for_graph(self, serial_tensor_node):
+        serial_tensor_node_id = serial_tensor_node.id()
+        dist_tensor = self._dist_tensors_for_graph.get(serial_tensor_node_id,
+                                                       None)
+        if dist_tensor:
+            return dist_tensor.dist_attr
+        else:
+            return None
+
+    def set_tensor_dist_attr_for_graph(self, serial_tensor_node, dist_attr):
+        assert serial_tensor_node.is_var() and \
+            serial_tensor_node.var() is not None
+        serial_tensor_id = serial_tensor_node.var().id()
+        dist_tensor = self._dist_tensors_for_program.get(serial_tensor_id, None)
+        assert dist_tensor is not None, \
+            "The distributed tensor of the program has not been added to this context."
+        serial_tensor_node_id = serial_tensor_node.id()
+        new_dist_tensor = DistributedTensor(dist_tensor.serial_tensor,
+                                            dist_attr)
+        self._dist_tensors_for_graph[serial_tensor_node_id] = new_dist_tensor
+
+    def get_op_dist_attr_for_program(self, serial_op):
+        serial_op_id = serial_op.desc.id()
+        dist_op = self._dist_ops_for_program.get(serial_op_id, None)
+        if dist_op:
+            return dist_op.dist_attr
+        else:
+            return None
+
+    def set_op_dist_attr_for_program(self, serial_op, dist_attr):
+        dist_op = DistributedOperator(serial_op, dist_attr)
+        self.add_dist_op_for_program(dist_op)
+
+    def get_op_dist_attr_for_graph(self, serial_op_node):
+        serial_op_node_id = serial_op_node.id()
+        dist_op = self._dist_ops_for_graph.get(serial_op_node_id, None)
+        if dist_op:
+            return dist_op.dist_attr
+        else:
+            return None
+
+    def set_op_dist_attr_for_graph(self, serial_op_node, dist_attr):
+        assert serial_op_node.is_op() and \
+            serial_op_node.op() is not None
+        serial_op_id = serial_op_node.op().id()
+        dist_op = self._dist_ops_for_program.get(serial_op_id, None)
+        assert dist_op is not None, \
+            "The distributed operator of the program has not been added to this context."
+        serial_op_node_id = serial_op_node.id()
+        new_dist_op = DistributedOperator(dist_op.serial_op, dist_attr)
+        self._dist_ops_for_graph[serial_op_node_id] = new_dist_op
+
+    def init_dist_attr_for_program(self):
+        assert self._serial_program, \
+            "Please set the program of this context before initializing its distribute attributes."
+        if self._is_initialized_for_program:
+            return
+        # Copy the dist tensors and dist ops annotated by users from the default context
+        default_ctx = get_default_distributed_context()
+        self._process_meshes = copy.deepcopy(default_ctx.process_meshes)
+        for block in self._serial_program.blocks:
+            for tensor in block.vars.values():
+                # Copy the distributed tensors in the default context
+                default_dist_tensor = default_ctx.get_dist_tensor_for_program(
+                    tensor)
+                if default_dist_tensor and default_ctx is not self:
+                    self.add_dist_tensor_for_program(default_dist_tensor)
+                current_dist_tensor = self.get_dist_tensor_for_program(tensor)
+                if current_dist_tensor is None:
+                    dist_tensor = DistributedTensor(tensor)
+                    self.add_dist_tensor_for_program(dist_tensor)
+            for op in block.ops:
+                # Copy the distributed operators in the default context
+                default_dist_op = default_ctx.get_dist_op_for_program(op)
+                if default_dist_op and default_ctx is not self:
+                    self.add_dist_op_for_program(default_dist_op)
+                current_dist_op = self.get_dist_op_for_program(op)
+                if current_dist_op is None:
+                    dist_op = DistributedOperator(op)
+                    self.add_dist_op_for_program(dist_op)
+        self._is_initialized_for_program = True
+
+    def init_dist_attr_for_graph(self):
+        assert self._is_initialized_for_program, \
+            "The program must be initialized before initializing the distributed attributes for its graph."
+        if self._is_initialized_for_graph:
+            return
+        # Convert program to graph
+        self._serial_graph = framework.IrGraph(
+            core.Graph(self._serial_program.desc))
+        all_nodes = self._serial_graph.all_nodes()
+        for node in all_nodes:
+            if node.is_var() and node.var() is not None:
+                tensor_desc = node.var()
+                tensor_id = tensor_desc.id()
+                dist_tensor = self._dist_tensors_for_program.get(tensor_id,
+                                                                 None)
+                assert dist_tensor is not None, \
+                    "Tensor must have a distributed tensor after the initialization for program."
+                self.set_tensor_dist_attr_for_graph(node, dist_tensor.dist_attr)
+            if node.is_op() and node.op() is not None:
+                op_desc = node.op()
+                op_id = op_desc.id()
+                dist_op = self._dist_ops_for_program.get(op_id, None)
+                assert dist_op is not None, \
+                    "Operator must have a distributed operator after the initialization for program."
+                self.set_op_dist_attr_for_graph(node, dist_op.dist_attr)
+        self._is_initialized_for_graph = True
+
+    def clear_dist_info_for_program(self):
+        self._dist_tensors_for_program.clear()
+        self._dist_ops_for_program.clear()
+
+    def clear_dist_info_for_graph(self):
+        self._dist_tensors_for_graph.clear()
+        self._dist_ops_for_graph.clear()
+
+    def copy_dist_attr_from_graph_to_program(self):
+        assert self._is_initialized_for_program and self._is_initialized_for_graph, \
+            "Both program and graph must be initialized."
+        updated_tensors = {}
+        all_nodes = self._serial_graph.all_nodes()
+        for node in all_nodes:
+            if node.is_var() and node.var() is not None:
+                tensor_desc = node.var()
+                tensor_id = tensor_desc.id()
+                updated = updated_tensors.get(tensor_desc.name(), False)
+                # If a var has multiples var nodes in graph, only use the first one for now
+                if not updated:
+                    tensor_dist_attr_for_graph = self.get_tensor_dist_attr_for_graph(
+                        node)
+                    dist_tensor_for_program = self._dist_tensors_for_program[
+                        tensor_id]
+                    dist_tensor_for_program.dist_attr = tensor_dist_attr_for_graph
+                    updated_tensors[tensor_desc.name()] = True
+            if node.is_op() and node.op() is not None:
+                op_desc = node.op()
+                op_id = op_desc.id()
+                op_dist_attr_for_graph = self.get_op_dist_attr_for_graph(node)
+                dist_op_for_program = self._dist_ops_for_program[op_id]
+                dist_op_for_program.dist_attr = op_dist_attr_for_graph
+
+    def amend_dist_attr_for_program(self):
+        for dist_tensor in self._dist_tensors_for_program.values():
+            serial_tensor = dist_tensor.serial_tensor
+            dist_attr = dist_tensor.dist_attr
+            if serial_tensor.type == core.VarDesc.VarType.READER:
+                tensor_shape = []
+            else:
+                tensor_shape = serial_tensor.shape
+            dims_mapping = dist_attr.dims_mapping
+            process_mesh_shape = dist_attr.process_mesh.topology
+            # If the dimension of tensor is less than the sharding dimension of process mesh,
+            # we just amend the dimension mapping to -1. (Is this really OK?)
+            for i in range(len(tensor_shape)):
+                if dims_mapping[i] != -1 and tensor_shape[i] > 0 \
+                    and process_mesh_shape[dims_mapping[i]] > tensor_shape[i]:
+                    dims_mapping[i] = -1
+
+        for dist_op in self._dist_ops_for_program.values():
+            serial_op = dist_op.serial_op
+            dist_attr = dist_op.dist_attr
+            for arg_name in serial_op.input_arg_names:
+                if dist_op.get_serial_input(arg_name) is None:
+                    tensor_shape = []
+                else:
+                    if dist_op.get_serial_input(arg_name).type == core.VarDesc.VarType.READER \
+                        or dist_op.serial_op.type == "create_py_reader":
+                        tensor_shape = []
+                    else:
+                        tensor_shape = dist_op.get_serial_input(arg_name).shape
+                dims_mapping = dist_attr.get_input_dims_mapping(arg_name)
+                process_mesh_shape = dist_attr.process_mesh.topology
+                # If the dimension of tensor is less than the sharding dimension of process mesh,
+                # we just amend the dimension mapping to -1. (Is this really OK?)
+                for i in range(len(tensor_shape)):
+                    if dims_mapping[i] != -1 and tensor_shape[i] > 0 \
+                        and process_mesh_shape[dims_mapping[i]] > tensor_shape[i]:
+                        dims_mapping[i] = -1
+            for arg_name in serial_op.output_arg_names:
+                if dist_op.get_serial_output(
+                        arg_name).type == core.VarDesc.VarType.READER:
+                    tensor_shape = []
+                else:
+                    tensor_shape = dist_op.get_serial_output(arg_name).shape
+                dims_mapping = dist_attr.get_output_dims_mapping(arg_name)
+                process_mesh_shape = dist_attr.process_mesh.topology
+                # If the dimension of tensor is less than the sharding dimension of process mesh,
+                # we just amend the dimension mapping to -1. (Is this really OK?)
+                for i in range(len(tensor_shape)):
+                    if dims_mapping[i] != -1 and tensor_shape[i] > 0 \
+                        and process_mesh_shape[dims_mapping[i]] > tensor_shape[i]:
+                        dims_mapping[i] = -1
+
+    def validate_dist_attr_for_program(self):
+        if not self._is_initialized_for_program:
+            assert False, \
+                "Program must be initialized before validating its distributed attributes"
+        for block in self.serial_program.blocks:
+            for tensor in block.vars.values():
+                dist_tensor = self.get_dist_tensor_for_program(tensor)
+                if (dist_tensor is not None) and (
+                        not dist_tensor.validate_dist_attr()):
+                    assert False, "Tensor {} has a wrong distributed attributes {}.".format(
+                        dist_tensor.serial_tensor.name, dist_tensor.dist_attr)
+            for op in block.ops:
+                dist_op = self.get_dist_op_for_program(op)
+                if (dist_op is not None) and (not dist_op.validate_dist_attr()):
+                    assert False, "Operator {} has a wrong distributed attributes {}.".format(
+                        dist_op.serial_op.type, dist_tensor.dist_attr)
+        return True
+
+
+class DistributedOperatorContext:
+    """
+    DistributedOperatorContext is used to create a dist op desc in Program.
+    Every time to create a new dist op, the context should be updated for it accordingly.
+    """
+
+    def __init__(self):
+        self._dst_main_program = None
+        self._dst_startup_program = None
+        self._varname_mapping = None
+        self._rank_id = None
+        self._cur_src_op = None
+        self._cur_dist_attr = None
+        self.gradopidx2opidx = {}
+        self.already_init_sync_vars = set()
+
+    def set_dst_main_program(self, prog):
+        self._dst_main_program = prog
+
+    def get_dst_main_program(self):
+        return self._dst_main_program
+
+    def set_dst_startup_program(self, prog):
+        self._dst_startup_program = prog
+
+    def get_dst_startup_program(self):
+        return self._dst_startup_program
+
+    def set_varname_mapping(self, mapping):
+        self._varname_mapping = mapping
+
+    def get_varname_mapping(self):
+        return self._varname_mapping
+
+    def set_rank_id(self, rank_id):
+        self._rank_id = rank_id
+
+    def get_rank_id(self):
+        return self._rank_id
+
+    def set_cur_src_op(self, cur_src_op):
+        self._cur_src_op = cur_src_op
+
+    def get_cur_src_op(self):
+        return self._cur_src_op
+
+    def prepare_forward_context(self, src_op):
+
+        self.set_cur_src_op(src_op)
+
+        # build input varname mapping
+        kinputs = {}
+        for input_name in src_op.desc.input_names():
+            varnames = []
+            for varname in src_op.desc.input(input_name):
+                varnames.append(self._varname_mapping[varname])
+            kinputs[input_name] = varnames
+
+        # build output varname mapping
+        koutputs = {}
+        for output_name in src_op.desc.output_names():
+            varnames = []
+            for varname in src_op.desc.output(output_name):
+                varnames.append(self._varname_mapping[varname])
+            koutputs[output_name] = varnames
+
+        return kinputs, koutputs
+
+    def prepare_backward_context(self, backward_op):
+
+        self.set_cur_src_op(backward_op)
+
+        # build input varname mapping
+        kinputs = {}
+        for input_name in backward_op.desc.input_names():
+            varnames = []
+            for varname in backward_op.desc.input(input_name):
+                varnames.append(varname)
+            kinputs[input_name] = varnames
+
+        # build output varname mapping
+        koutputs = {}
+        for output_name in backward_op.desc.output_names():
+            varnames = []
+            for varname in backward_op.desc.output(output_name):
+                varnames.append(varname)
+            koutputs[output_name] = varnames
+
+        return kinputs, koutputs
diff --git a/python/paddle/distributed/auto_parallel/dist_op.py b/python/paddle/distributed/auto_parallel/dist_op.py
new file mode 100644
index 0000000000000..aa447d7a42347
--- /dev/null
+++ b/python/paddle/distributed/auto_parallel/dist_op.py
@@ -0,0 +1,243 @@
+#   Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License
+
+import copy
+from collections import defaultdict
+import paddle
+from paddle.fluid import core
+from paddle.fluid.framework import Variable
+from .dist_attribute import TensorDistributedAttribute
+from .dist_attribute import OperatorDistributedAttribute
+from .dist_attribute import append_op_input_suffix
+from .dist_attribute import append_op_output_suffix
+from .dist_attribute import get_tensor_dist_attr_field_keys
+from .dist_attribute import get_op_dist_attr_field_keys
+
+
+class DistributedOperator:
+    def __init__(self, serial_op, dist_attr=None):
+        self._serial_op = serial_op
+        self._serial_inputs = {}
+        self._serial_outputs = {}
+        self._dist_attr = None
+        # Reuse the dist_attr setter to initialize _dist_attr
+        self.dist_attr = dist_attr
+
+    @property
+    def serial_op(self):
+        return self._serial_op
+
+    @property
+    def dist_attr(self):
+        return self._dist_attr
+
+    @dist_attr.setter
+    def dist_attr(self, dist_attr):
+        if self._dist_attr is None:
+            self._dist_attr = OperatorDistributedAttribute()
+        # Create new dist_attr related to current serial_op
+        dist_attr = self._filter_dist_attr(dist_attr)
+        # Append suffix to mark the inputs or outputs
+        if isinstance(dist_attr, dict):
+            # Copy the keys since we may add new ones
+            for key in list(dist_attr.keys()):
+                if isinstance(key, Variable):
+                    if key.name in self._serial_op.input_arg_names:
+                        dist_attr[append_op_input_suffix(key.name)] = True
+                    if key.name in self._serial_op.output_arg_names:
+                        dist_attr[append_op_output_suffix(key.name)] = True
+        self._dist_attr.init(dist_attr)
+        self._init_default_dist_attr()
+
+    def get_serial_input(self, name):
+        return self._serial_inputs.get(name, None)
+
+    def get_serial_output(self, name):
+        return self._serial_outputs.get(name, None)
+
+    def _init_default_dist_attr(self):
+        for tensor_name in self._serial_op.input_arg_names:
+            if self._serial_op.type == "create_py_reader":
+                tensor = None
+            else:
+                tensor = self._serial_op.block._var_recursive(tensor_name)
+            self._serial_inputs[tensor_name] = tensor
+            if tensor is None:
+                tensor_shape = []
+            else:
+                if tensor.type == core.VarDesc.VarType.READER:
+                    tensor_shape = []
+                else:
+                    tensor_shape = tensor.shape
+            if self._dist_attr.get_input_dims_mapping(tensor_name) is None:
+                tensor_dims_mapping = [-1 for _ in range(len(tensor_shape))]
+                self._dist_attr.set_input_dims_mapping(tensor_name,
+                                                       tensor_dims_mapping)
+        for tensor_name in self._serial_op.output_arg_names:
+            tensor = self._serial_op.block._var_recursive(tensor_name)
+            if tensor.type == core.VarDesc.VarType.READER:
+                tensor_shape = []
+            else:
+                tensor_shape = tensor.shape
+            self._serial_outputs[tensor_name] = tensor
+            if self._dist_attr.get_output_dims_mapping(tensor_name) is None:
+                tensor_dims_mapping = [-1 for _ in range(len(tensor_shape))]
+                self._dist_attr.set_output_dims_mapping(tensor_name,
+                                                        tensor_dims_mapping)
+        if self._dist_attr.impl_type is None:
+            self._dist_attr.impl_type = "default"
+        if self._dist_attr.impl_idx is None:
+            self._dist_attr.impl_idx = -2
+
+    def _filter_dist_attr(self, dist_attr):
+        if dist_attr is None:
+            return None
+        new_dist_attr = None
+        if isinstance(dist_attr, dict):
+            new_dist_attr = {}
+            for key, value in dist_attr.items():
+                if isinstance(key, Variable):
+                    if key.name in self._serial_op.input_arg_names \
+                        or key.name in self._serial_op.output_arg_names:
+                        new_dist_attr[key] = value
+                else:
+                    new_dist_attr[key] = value
+        elif isinstance(dist_attr, OperatorDistributedAttribute):
+            new_dist_attr = copy.deepcopy(dist_attr)
+            new_dist_attr._inputs_dist_attrs.clear()
+            new_dist_attr._outputs_dist_attrs.clear()
+            for tensor_name in self._serial_op.input_arg_names:
+                tensor_dist_attr = dist_attr.get_input_dist_attr(tensor_name)
+                if tensor_dist_attr:
+                    new_dist_attr.set_input_dist_attr(tensor_name,
+                                                      tensor_dist_attr)
+            for tensor_name in self._serial_op.output_arg_names:
+                tensor_dist_attr = dist_attr.get_output_dist_attr(tensor_name)
+                if tensor_dist_attr:
+                    new_dist_attr.set_output_dist_attr(tensor_name,
+                                                       tensor_dist_attr)
+        else:
+            assert False, "Cannot recognize the {} parameter.".format(dist_attr)
+        return new_dist_attr
+
+    def validate_dist_attr(self):
+        if "read" in self.serial_op.type:
+            return True
+        for name in self.serial_op.input_arg_names:
+            input_dist_attr = self.dist_attr.get_input_dist_attr(name)
+            dims_mapping = input_dist_attr.dims_mapping
+            shape = self.get_serial_input(name).shape
+            if len(shape) != len(dims_mapping):
+                return False
+            for i in range(len(dims_mapping)):
+                if dims_mapping[i] < -1 or dims_mapping[i] >= len(
+                        self.dist_attr.process_mesh.topology):
+                    return False
+            for i in range(len(self.dist_attr.process_mesh.topology)):
+                if dims_mapping.count(i) > 1:
+                    return False
+            if self.dist_attr.process_mesh != input_dist_attr.process_mesh:
+                return False
+
+        for name in self.serial_op.output_arg_names:
+            output_dist_attr = self.dist_attr.get_output_dist_attr(name)
+            dims_mapping = output_dist_attr.dims_mapping
+            shape = self.get_serial_output(name).shape
+            if len(shape) != len(dims_mapping):
+                return False
+            for i in range(len(dims_mapping)):
+                if dims_mapping[i] < -1 or dims_mapping[i] >= len(
+                        self.dist_attr.process_mesh.topology):
+                    return False
+            for i in range(len(self.dist_attr.process_mesh.topology)):
+                if dims_mapping.count(i) > 1:
+                    return False
+            if self.dist_attr.process_mesh != output_dist_attr.process_mesh:
+                return False
+        return True
+
+    def __str__(self):
+        str = "{{op type: {}, op id: {}".format(self.serial_op.desc.type(),
+                                                self.serial_op.desc.id())
+
+        # str += ", {}".format(self.dist_attr)
+        # return str
+
+        if self.dist_attr.is_annotated("process_mesh"):
+            annotated_str = "annotated"
+        else:
+            annotated_str = "non-annotated"
+        str += ", process_mesh ({}): {}".format(annotated_str,
+                                                self.dist_attr.process_mesh)
+
+        for arg_name in self.serial_op.desc.input_arg_names():
+            dims_mapping = self.dist_attr.get_input_dims_mapping(arg_name)
+            if self.dist_attr.is_annotated_input_dims_mapping(arg_name):
+                annotated_str = "annotated"
+            else:
+                annotated_str = "non-annotated"
+            if self.get_serial_input(arg_name) is not None:
+                if self.get_serial_input(arg_name).is_parameter:
+                    is_parameter_str = "parameter"
+                else:
+                    is_parameter_str = "non-parameter"
+            else:
+                is_parameter_str = "non-parameter"
+            str += ", {}'s dims_mapping (input, {}, {}): {}".format(
+                arg_name, annotated_str, is_parameter_str, dims_mapping)
+
+        for arg_name in self.serial_op.desc.output_arg_names():
+            dims_mapping = self.dist_attr.get_output_dims_mapping(arg_name)
+            if self.dist_attr.is_annotated_output_dims_mapping(arg_name):
+                annotated_str = "annotated"
+            else:
+                annotated_str = "non-annotated"
+            if self.get_serial_output(arg_name) is not None:
+                if self.get_serial_output(arg_name).is_parameter:
+                    is_parameter_str = "parameter"
+                else:
+                    is_parameter_str = "non-parameter"
+            else:
+                is_parameter_str = "non-parameter"
+            str += ", {}'s dims_mapping (output, {}, {}): {}".format(
+                arg_name, annotated_str, is_parameter_str, dims_mapping)
+
+        str += ", pipeline stage: {}".format(None)
+
+        str += ", dist_impl idx: {} }}".format(self.dist_attr._impl_idx)
+
+        return str
+
+
+class DistributedModule:
+    def __init__(self, serial_module, dist_attr=None):
+        self._serial_module = serial_module
+        self._dist_attr = dist_attr
+
+    def __call__(self, *args, **kwargs):
+        from .dist_context import get_default_distributed_context
+        main_prog = paddle.fluid.default_main_program()
+        main_block = main_prog.global_block()
+        op_size = len(main_block.ops)
+        output = self._serial_module(*args, **kwargs)
+        new_op_size = len(main_block.ops)
+        default_dist_ctx = get_default_distributed_context()
+        for idx in range(op_size, new_op_size):
+            op = main_block.ops[idx]
+            dist_op = DistributedOperator(op, self._dist_attr)
+            dist_op.dist_attr.mark_annotated_as(self._dist_attr)
+            default_dist_ctx.add_dist_op_for_program(dist_op)
+        if isinstance(output, Variable):
+            output = [output]
+        return list(output)
diff --git a/python/paddle/distributed/auto_parallel/dist_tensor.py b/python/paddle/distributed/auto_parallel/dist_tensor.py
new file mode 100644
index 0000000000000..3b292d7f435ec
--- /dev/null
+++ b/python/paddle/distributed/auto_parallel/dist_tensor.py
@@ -0,0 +1,103 @@
+#   Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License
+
+import copy
+from paddle.fluid import core
+from .dist_attribute import TensorDistributedAttribute
+from .dist_attribute import get_tensor_dist_attr_field_keys
+
+
+class DistributedTensor:
+    def __init__(self, serial_tensor, dist_attr=None):
+        self._serial_tensor = serial_tensor
+        self._dist_attr = None
+        self._batch_dim = 0
+        # Reuse the dist_attr setter to initialize _dist_attr
+        self.dist_attr = dist_attr
+
+    @property
+    def serial_tensor(self):
+        return self._serial_tensor
+
+    @property
+    def dist_attr(self):
+        return self._dist_attr
+
+    @dist_attr.setter
+    def dist_attr(self, dist_attr):
+        if self._dist_attr is None:
+            self._dist_attr = TensorDistributedAttribute()
+        self._dist_attr.init(dist_attr)
+        self._init_default_dist_attr()
+
+    def _init_default_dist_attr(self):
+        if self._dist_attr.dims_mapping is None:
+            if self.serial_tensor.type == core.VarDesc.VarType.READER:
+                tensor_shape = []
+            else:
+                tensor_shape = self._serial_tensor.shape
+            tensor_dims_mapping = [-1 for _ in range(len(tensor_shape))]
+            self._dist_attr.dims_mapping = tensor_dims_mapping
+
+    def validate_dist_attr(self):
+        if self.serial_tensor.type == core.VarDesc.VarType.READER:
+            return True
+        tensor_shape = self.serial_tensor.shape
+        if len(tensor_shape) != len(self.dist_attr.dims_mapping):
+            return False
+        for i in range(len(self.dist_attr.dims_mapping)):
+            if self.dist_attr.dims_mapping[
+                    i] < -1 or self.dist_attr.dims_mapping[i] >= len(
+                        self.dist_attr.process_mesh.topology):
+                return False
+        for i in range(len(self.dist_attr.process_mesh.topology)):
+            if self.dist_attr.dims_mapping.count(i) > 1:
+                return False
+        return True
+
+    def __str__(self):
+        str = "{{tensor name: {}, tensor id: {}".format(
+            self.serial_tensor.desc.name(), self.serial_tensor.desc.id())
+
+        # str += ", {}".format(self.dist_attr)
+        # return str
+
+        if self.dist_attr.is_annotated("process_mesh"):
+            annotated_str = "annotated"
+        else:
+            annotated_str = "non-annotated"
+        str += ", process_mesh ({}): {}".format(annotated_str,
+                                                self.dist_attr.process_mesh)
+
+        str += ", is_parameter: {}".format(self.serial_tensor.is_parameter)
+
+        if self.dist_attr.is_annotated("dims_mapping"):
+            annotated_str = "annotated"
+        else:
+            annotated_str = "non-annotated"
+        str += ", dims_mapping ({}): {}".format(annotated_str,
+                                                self.dist_attr.dims_mapping)
+
+        if self.dist_attr.is_annotated("shard_mask"):
+            annotated_str = "annotated"
+        else:
+            annotated_str = "non-annotated"
+        str += ", shard_mask ({}): {}".format(annotated_str, None)
+
+        if self.dist_attr.is_annotated("offload_device"):
+            annotated_str = "annotated"
+        else:
+            annotated_str = "non-annotated"
+        str += ", offload_device ({}): {} }}".format(annotated_str, None)
+        return str
diff --git a/python/paddle/distributed/auto_parallel/interface.py b/python/paddle/distributed/auto_parallel/interface.py
index 30055c5b763a1..f12b85c6f2bb0 100644
--- a/python/paddle/distributed/auto_parallel/interface.py
+++ b/python/paddle/distributed/auto_parallel/interface.py
@@ -18,293 +18,34 @@
 import paddle.fluid.core as core
 from paddle.fluid.framework import Variable
 from paddle.fluid.framework import in_dygraph_mode
-
-__all__ = []
-
-# a map from ProcessMesh ids to the ProcessMesh instances
-_g_process_mesh_map = dict()
-
-# user defined map from logical process ids to physical ones
-_user_defined_physical_map = None
-
-
-def _append_attr_suffix(name):
-    """
-    Append auto parallel suffix for distributed attribute name.
-    """
-    return name + core.kAutoParallelSuffix()
-
-
-def _remove_attr_suffix(name):
-    """
-    Remove auto parallel suffix from distributed attribute name.
-    """
-    return name.strip(core.kAutoParallelSuffix())
+from .dist_context import get_default_distributed_context
+from .dist_tensor import DistributedTensor
+from .dist_op import DistributedModule
+from .dist_attribute import TensorDistributedAttribute
+from .dist_attribute import OperatorDistributedAttribute
 
 
 def _static_mode_check():
     if in_dygraph_mode():
-        raise RuntimeError("Auto-parallel only supports static mode, "
-                           "please use paddle.enable_static().")
-
-
-def _get_nested_list_shape(nested_list):
-    """
-    Get the shape of a nested_list.
-    """
-    result = []
-    while isinstance(nested_list, list):
-        result.append(len(nested_list))
-        nested_list = nested_list[0]
-    return result
-
-
-def _flatten_nested_list(nested_list):
-    """
-    Get a list of all items in a nested_list.
-    Ref: https://stackoverflow.com/questions/952914/how-to-make-a-flat-list-out-of-a-list-of-lists
-    """
-    result = numpy.array(nested_list).flatten().tolist()
-    return result
-
-
-class ProcessMesh(object):
-    r"""
-    The class `Processmesh` describes the topology of logical processes. 
-    A mesh is an N-dimensional array. The shape of the N-dimensional
-    array represents the topology of logical processes and every
-    element of the N-dimensional array represent a logical process. For
-    example, the 2-dimensional array [[2, 4, 5], [0, 1, 3]]
-    illustrates six logical processes organized as the topology [2, 3],
-    i.e., the shape of the 2-dimensional array. With the above topology,
-    there are two parallel groups, where the first parallel group has a
-    parallel degree of 2 and the second one has a parallel degree of 3.
-    And the first logical process is the one with id=2.
-
-    Args:
-        mesh (list): an N-dimensional array (nested list) describes the toplogy
-            of logical processes. The shape of the N-dimensional array
-            represents the topology of logical processes and every 
-            element of the N-dimensional array represents a logical process.
-        parent (ProcessMesh, optional): the parent ProcessMesh. None means
-            the ProcessMesh is the root one without parent ProcessMesh.
-            Default: None.
-    
-    Returns:
-        None
-
-    Raises:
-        ValueError: If `mesh` is not an instance of list.
-
-    Examples:
-        .. code-block:: python
-
-            import paddle
-            import paddle.distributed as dist
-            
-            paddle.enable_static()
-            
-            mesh = dist.ProcessMesh([[2, 4, 5], [0, 1, 3]])
-            assert mesh.parent is None
-            assert mesh.topology == [2, 3]
-            assert mesh.process_group == [2, 4, 5, 0, 1, 3]
-            mesh.set_placement([0, 1, 2, 3, 4, 5])
-
-    """
-
-    def __init__(self, mesh, parent=None):
-        _static_mode_check()
-        if mesh is None or not isinstance(mesh, list):
-            raise ValueError('mesh must be an instance of list.')
-
-        self._topology = _get_nested_list_shape(mesh)
-        self._processes = _flatten_nested_list(mesh)
-
-        # Every element of mesh must be >= 0.
-        assert min(self._processes) >= 0, ('All elements of mesh must be >= 0.')
-
-        unique_ids = set(self._processes)
-        assert len(unique_ids) == len(self._processes), (
-            'All elements of mesh must be unique.')
-
-        if parent is None:
-            # For root ProcessMesh, the ids of logical processes must be range
-            # from 0 to N-1, where N is the number of logical processes. 
-            assert max(self._processes) == len(self._processes) - 1, (
-                'For root ProcessMesh, ids of logical processes must be range '
-                'from 0 to N-1, where N is the number of logical processes.')
-
-            parent_id = core.kNoneProcessMeshIndex()
-            assert len(_g_process_mesh_map.keys()) == 0, (
-                'The first ProcessMesh must be the root, which has no parent.')
-        else:
-            assert len(_g_process_mesh_map.keys()) > 0, (
-                'All ProcessMesh must have a parent except the root one.')
-
-            assert isinstance(parent, ProcessMesh), (
-                'parent must be an instance of ProcessMesh.')
-            parent_id = parent._desc.id
-
-            # All elements in mesh must belong to its parent
-            parent_ids = set(parent.process_group)
-            assert unique_ids <= parent_ids, (
-                'All elements in mesh must belong to its parent.')
-
-        self._desc = core.ProcessMeshDesc(self._topology, self._processes,
-                                          parent_id)
-
-        self._id = self._desc.id
-        self._parent_id = parent_id
-        assert self._id not in _g_process_mesh_map, (
-            "The ProcessMesh with id %d already exists." % self._id)
-        _g_process_mesh_map[self._id] = self
-
-    @property
-    def topology(self):
-        r"""
-        Get the topology of logical processes belonging to this ProcessMesh.
-        This is the shape of `mesh` used to initialized this ProcessMesh.
-        """
-        return self._topology
-
-    @property
-    def process_group(self):
-        r"""
-        Get a list of all processes belonging to this ProcessMesh.
-        """
-        return self._processes
-
-    @property
-    def parent(self):
-        r"""
-        Get the parent ProcessMesh.
-        """
-        if self._parent_id == core.kNoneProcessMeshIndex(): return None
-        assert self._parent_id in _g_process_mesh_map, (
-            "parent with id %d does not exist." % self._parent_id)
-        return _g_process_mesh_map[self._parent_id]
-
-    @property
-    def ndim(self):
-        r"""
-        Get the number of dimension of ProcessMesh.
-        """
-        return len(self._topology)
-
-    def set_placement(self, order):
-        """
-        Set the map from logical processes to physical ones using the
-        user defined order.
-
-        Args:
-            order (list): order of the physical process ids.
-        
-        Returns:
-            None
-
-        Examples:
-            .. code-block:: python
-
-                import paddle
-                import paddle.distributed as dist
-                
-                paddle.enable_static()
-                
-                mesh = dist.ProcessMesh([[2, 4, 5], [0, 1, 3]])
-                mesh.set_placement([0, 1, 2, 3, 4, 5])
-
-        """
-        assert self.parent is None, (
-            "This function can only be called by the root ProcessMesh.")
-        unique_ids = set(order)
-        assert isinstance(order, list)
-
-        assert len(unique_ids) == len(order), (
-            "All elements in order must be unique.")
-        assert min(order) == 0
-        assert max(order) == len(order) - 1, (
-            "All elements in order must be from 0 to N - 1, where N "
-            "is the number of physical processes.")
-
-        logical_order = self.process_group
-        global _user_defined_physical_map
-        assert _user_defined_physical_map is None, (
-            "This function can only be called once.")
-        _user_defined_physical_map = dict()
-
-        assert len(logical_order) == len(order)
-        for idx, l_id in enumerate(logical_order):
-            _user_defined_physical_map[l_id] = order[idx]
-
-    def _reset_global_process_mesh_map(self):
-        """
-        Remove all process mesh in _g_process_mesh_map, make it empty.
-        """
-
-        _g_process_mesh_map = dict()
-
-    def __eq__(self, other):
-        assert other and isinstance(other, ProcessMesh)
-        if self.topology != other.topology or self.process_group != other.process_group:
-            return False
-        return True
-
-    def __ne__(self, other):
-        return not self.__eq__(other)
-
-    def __str__(self):
-        str = "shape {} and process group {}".format(self.topology,
-                                                     self.process_group)
-        return str
-
-    def __deepcopy__(self, memo):
-        cls = self.__class__
-        result = cls.__new__(cls)
-        memo[id(self)] = result
-        for k, v in self.__dict__.items():
-            # No need to copy the owner tensor and context
-            if k == "_desc":
-                setattr(result, k, v)
-            else:
-                setattr(result, k, copy.deepcopy(v, memo))
-        return result
+        raise RuntimeError("Auto-parallel only supports static mode for now, "
+                           "please use paddle.enable_static() first.")
 
 
-def _dim_mapping_checker(tensor, mesh, dim_mapping):
-    assert isinstance(mesh,
-                      ProcessMesh), 'The type of mesh must be ProcessMesh.'
-    assert isinstance(dim_mapping,
-                      list), 'The type of dim_mapping must be list.'
-    assert len(tensor.shape) == len(dim_mapping), (
-        'The number of dimensions '
-        'of tensor must be the same as the length of its corresponding '
-        'dim_mapping.')
-    mesh_dim = len(mesh.topology)
-    dim_set = set()
-    for i in range(len(dim_mapping)):
-        assert dim_mapping[i] == -1 or (
-            dim_mapping[i] < mesh_dim and dim_mapping[i] >= 0), (
-                'Each element '
-                'in dim_mapping must be greater than zero and less than the '
-                'length of its corresponding topology, or it must be -1.')
-        if dim_mapping[i] >= 0:
-            assert dim_mapping[i] not in dim_set
-            dim_set.add(dim_mapping[i])
-
-
-def shard_tensor(x, mesh, dim_mapping):
+def shard_tensor(x, dist_attr=None):
     """
     Add distributed attributes for a tensors.
 
     Args:
-        x (Tensor): the tensor to process.
-        mesh (ProcessMesh): an instance of ProcessMesh to describe the topology of logical processes.
-        dim_mapping (list): a list to describe the mapping between `x` and `mesh`,
-            the dimension `i` of `x` is split across the dimension `dims_mapping[i]`, where -1 means
-            without parition along the corresponding dimension.
+        x (Tensor): the tensor to be sharded.
+        dist_attr (dict): the tensor distributed attributes. The accepted attributes are as follow:
+            "process_mesh": a nested list an to describe the mesh topology of logical processes.
+            "dims_mapping": a list to describe the mapping between `x` and `process_mesh`, the dimension 
+                `i` of `x` is split across the dimension `dims_mapping[i]` of `process_mesh`, 
+                where -1 means that tensor dimension is not split.
+            Both process_mesh and dims_mapping are optional and users can specify as need.
 
     Returns:
-        Tensor: the tensor `x` itself.
+        Tensor: the tensor `x` annotated with distributed attributes.
 
     Examples:
         .. code-block:: python
@@ -314,87 +55,36 @@ def shard_tensor(x, mesh, dim_mapping):
             
             paddle.enable_static()
 
-            mesh = dist.ProcessMesh([[2, 4, 5], [0, 1, 3]])
-            x = paddle.ones([4, 6])
-            dist.shard_tensor(x, mesh, [0, -1])
-
-    """
-    _static_mode_check()
-    _dim_mapping_checker(x, mesh, dim_mapping)
-    attr_name = _append_attr_suffix('mesh_id')
-    x._set_attr(attr_name, mesh._id)
-    attr_name = _append_attr_suffix('dim_mapping')
-    x._set_attr(attr_name, dim_mapping)
-    return x
-
-
-def set_shard_mask(x, mask):
-    """
-    Set the mask for a tensor which mask out the tensor from some processes in its mesh.
-
-    Args:
-        x (Tensor): the tensor to process.
-        mask (list): a nested list. The shape of `mask` must be the same as the ProcessMesh belonging to
-            the tensor `x`. Every value of `mask` must be one or zero, where one means 
-            the tenor `x` will be put on the corresponding logical process and zero means the tensor `x`
-            will not be put on the corresponding logical process.
-            For example, for a ProcessMesh represented by the 2-dimensional
-            array [[2, 4, 5], [0, 1, 3]], and a `mask` given by the
-            2-dimensional [[1, 0, 1], [0, 1, 0]],
-            then the tensor `x` will only be put on logical processes 2, 5 and 1.
-
-    Returns:
-        Tensor: the tensor `x` itself.
-
-    Examples:
-        .. code-block:: python
-
-            import paddle
-            import paddle.distributed as dist
-
-            paddle.enable_static()
-            
-            mesh = dist.ProcessMesh([[2, 4, 5], [0, 1, 3]])
-            mask = [[1, 0, 1], [0, 1, 0]]
             x = paddle.ones([4, 6])
-            dist.shard_tensor(x, mesh, [-1, 1])
-            dist.set_shard_mask(x, mask)
+            dist.shard_tensor(x, dist_attr={"process_mesh": [[0, 1], [2, 3]],
+                                            "dims_mapping": [0, -1]})
 
     """
     _static_mode_check()
-    assert isinstance(mask, list)
-    np_mask = numpy.array(mask)
-    min_ele = numpy.min(np_mask)
-    max_ele = numpy.max(np_mask)
-    mesh_attr_name = _append_attr_suffix('mesh_id')
-    assert x._has_attr(mesh_attr_name), \
-        "Please set process mesh for the variable firstly."
-    assert min_ele >= 0 and max_ele <= 1, "Elements in mask must be 0 or 1."
-    x_mesh = x.process_mesh
-    assert x_mesh, "Please set process mesh for the variable firstly."
-    assert x_mesh.topology == list(np_mask.shape), (
-        "The shape of mask "
-        "must be the same as the shape of its Process Mesh.")
-    attr_name = _append_attr_suffix('mask')
-    x._set_attr(attr_name, _flatten_nested_list(mask))
+    assert dist_attr is None or isinstance(dist_attr, (dict, TensorDistributedAttribute)), \
+        "The type of dist_attr must be None, dict or TensorDistributedAttribute."
+    dist_tensor = DistributedTensor(x, dist_attr)
+    dist_tensor.dist_attr.mark_annotated_as(dist_attr)
+    default_dist_ctx = get_default_distributed_context()
+    default_dist_ctx.add_dist_tensor_for_program(dist_tensor)
     return x
 
 
-def shard_op(op_fn, mesh, dim_mapping_dict, **kwargs):
+def shard_op(op_fn, dist_attr=None):
     """
     Call a functioin and add distributed attributes for ops added by the function.
 
     Args:
-        op_fn (callable): a callable object of an API.
-        mesh (ProcessMesh): an instance of ProcessMesh specifies the topology of logical processes.
-        dim_mapping_dict (dict): a mapping from tensor's name to its dims_mapping.
-            The dim_mapping is a list to describe the mapping between a tensor and `mesh`,
-            the dimension `i` of the tensor is split across the dimension `dim_mapping[i]`,
-            where -1 means without parition along the corresponding dimension.
-        kwargs (dict): a dict of parameter passed to the function `op_fn`.
+        op_fn (callable): a callable operator or module to be sharded.
+        dist_attr (dict): the operator distributed attributes. The accepted attributes are classified into 
+            two categories. The first category decsribes the distributed attributes shared by all inputs and 
+            outputs, and only `process_mesh` can be specified now. The second category describes distributed
+            attributes for inputs or outputs same as the `dist_attr` of `shard_tensor`. All of them are
+            optional and users can specify them as need. Note that `process_mesh` for operators must be the
+            same as these process_meshes for inputs and outputs. 
 
     Returns:
-        list: the outputs of the function `op_fn`.
+        list: the outputs of the function `op_fn`, which are annotated with distributed attributes.
 
     Examples:
         .. code-block:: python
@@ -404,100 +94,19 @@ def shard_op(op_fn, mesh, dim_mapping_dict, **kwargs):
 
             paddle.enable_static()
             
-            mesh = dist.ProcessMesh([[2, 4, 5], [0, 1, 3]])
             x = paddle.ones([4, 6])
             y = paddle.zeros([4, 6])
-            kwargs = {'x': x, 'y': y}
-            dist.shard_op(paddle.add, mesh, None, **kwargs)
-
-    """
-    _static_mode_check()
-    main_prog = paddle.fluid.default_main_program()
-    main_block = main_prog.global_block()
-    op_size = len(main_block.ops)
-    output = op_fn(**kwargs)
-    new_op_size = len(main_block.ops)
-    if dim_mapping_dict is None:
-        dim_mapping_dict = dict()
-    else:
-        assert isinstance(dim_mapping_dict,
-                          dict), 'The type of dim_mapping_dict must be dict.'
-        for var_name in dim_mapping_dict.keys():
-            dim_mapping = dim_mapping_dict[var_name]
-            tensor = main_block.var(var_name)
-            _dim_mapping_checker(tensor, mesh, dim_mapping)
-    for idx in range(op_size, new_op_size):
-        op = main_block.ops[idx]
-        attr_name = _append_attr_suffix('mesh_id')
-        op._set_attr(attr_name, mesh._id)
-        for var_name in dim_mapping_dict.keys():
-            assert var_name in op.output_arg_names + op.input_arg_names
-            attr_name = _append_attr_suffix(var_name)
-            if var_name in op.input_arg_names:
-                # we use the prefix "IN_" to indicates an input argument name
-                attr_name = "IN_" + attr_name
-            else:
-                # we use the prefix "OUT_" to indicates an input argument name
-                attr_name = "OUT_" + attr_name
-            op._set_attr(attr_name, dim_mapping_dict[var_name])
-
-    if isinstance(output, Variable):
-        output = [output]
-    return list(output)
-
-
-def set_offload_device(x, device):
-    """
-    Set the device that the tensor `x` will be put on.
-
-    Args:
-        x (tensor): the tensor to process.
-        device (str): the device that the tensor `x` will be put on, e.g., 'cpu'.
-
-    Returns:
-        Tensor: the tensor `x` itself.
-
-    Examples:
-        .. code-block:: python
-
-            import paddle
-            import paddle.distributed as dist
-
-            paddle.enable_static()
-            
-            x = paddle.ones([4, 6])
-            dist.set_offload_device(x, 'cpu')
-
-    """
-    _static_mode_check()
-    assert device == "cpu", "Only 'cpu' is supported for destination device."
-    attr_name = _append_attr_suffix("offload_device")
-    x._set_attr(attr_name, device)
-    return x
-
-
-def set_pipeline_stage(stage):
-    """
-    Set the pipeline stage of the following ops.
-
-    Args:
-        stage (int): the pipeline stage the following ops belonging to.
-
-    Returns:
-        None.
-
-    Examples:
-        .. code-block:: python
-
-            import paddle
-            import paddle.distributed as dist
-
-            paddle.enable_static()
-            
-            dist.set_pipeline_stage(0)
+            dist_add = dist.shard_op(paddle.add,
+                                     dist_attr={
+                                         "process_mesh": [[2, 3, 1], [0, 4, 5]],
+                                         x: {"dims_mapping": [-1, 0]},
+                                         y: {"dims_mapping": [0, -1]}
+                                     })
+            dist_add(x, y)
 
     """
-    from paddle.fluid.framework import _set_pipeline_stage
     _static_mode_check()
-    assert isinstance(stage, int), 'The type of stage must be int.'
-    _set_pipeline_stage(stage)
+    assert dist_attr is None or isinstance(dist_attr, (dict, OperatorDistributedAttribute)), \
+        "The type of dist_attr must be dict or OperatorDistributedAttribute."
+    dist_module = DistributedModule(op_fn, dist_attr)
+    return dist_module
diff --git a/python/paddle/distributed/auto_parallel/operators/__init__.py b/python/paddle/distributed/auto_parallel/operators/__init__.py
index 3b3359b4ebf1c..d0ddeb1dcc711 100644
--- a/python/paddle/distributed/auto_parallel/operators/__init__.py
+++ b/python/paddle/distributed/auto_parallel/operators/__init__.py
@@ -12,9 +12,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License
 
-from .common import DistributedOperator
+from .common import DistributedOperatorImplContainer
 from .common import DistributedOperatorImpl
-from .common import register_distributed_operator
+from .common import register_distributed_operator_impl_container
 from .common import register_distributed_operator_impl
 from .common import find_best_compatible_distributed_operator_impl
 from . import dist_embedding
diff --git a/python/paddle/distributed/auto_parallel/operators/common.py b/python/paddle/distributed/auto_parallel/operators/common.py
index 5685c40a3227b..376e1a8ac6851 100644
--- a/python/paddle/distributed/auto_parallel/operators/common.py
+++ b/python/paddle/distributed/auto_parallel/operators/common.py
@@ -12,10 +12,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License
 
-DISTRIBUTED_OPERATORS = {}
+_g_distributed_operator_impl_registries = {}
 
 
-class DistributedOperator:
+class DistributedOperatorImplContainer:
     def __init__(self):
         self._impls = []
         self._name = None
@@ -47,67 +47,60 @@ def backward(dist_ctx, *grad_outputs, **kwargs):
     def get_name(self):
         return self._name
 
-    def is_process_mesh_compatible(self, op_dist_attr):
+    def is_input_compatible(self, dist_op):
         raise NotImplementedError("Please Implement this method in Subclass.")
 
-    def is_input_compatible(self, op_dist_attr):
+    def is_output_compatible(self, dist_op):
         raise NotImplementedError("Please Implement this method in Subclass.")
 
-    def is_output_compatible(self, op_dist_attr):
-        raise NotImplementedError("Please Implement this method in Subclass.")
-
-    def is_compatible(self, op_dist_attr):
-        return self.is_process_mesh_compatible(op_dist_attr) \
-            and self.is_input_compatible(op_dist_attr) \
-            and self.is_output_compatible(op_dist_attr)
+    def is_compatible(self, dist_op):
+        return self.is_input_compatible(dist_op) and \
+            self.is_output_compatible(dist_op)
 
-    def update_dims_mapping(self, op_dist_attr):
+    def update_dims_mapping(self, dist_op):
         raise NotImplementedError("Please Implement this method in Subclass.")
 
 
-def register_distributed_operator(name, dist_op):
-    global DISTRIBUTED_OPERATORS
-    DISTRIBUTED_OPERATORS[name] = dist_op
+def register_distributed_operator_impl_container(name, dist_op_impl_container):
+    global _g_distributed_operator_impl_registries
+    _g_distributed_operator_impl_registries[name] = dist_op_impl_container
 
 
-def get_distributed_operator(name):
-    global DISTRIBUTED_OPERATORS
-    return DISTRIBUTED_OPERATORS.get(name, None)
+def get_distributed_operator_impl_container(name):
+    global _g_distributed_operator_impl_registries
+    return _g_distributed_operator_impl_registries.get(name, None)
 
 
 def register_distributed_operator_impl(name, dist_impl):
-    dist_op = get_distributed_operator(name)
-    if dist_op is not None:
-        dist_op.register_impl(dist_impl)
+    dist_op_impl_container = get_distributed_operator_impl_container(name)
+    if dist_op_impl_container is not None:
+        dist_op_impl_container.register_impl(dist_impl)
     else:
-        assert False, "Must register distributed operator first."
+        assert False, "Must register distributed operator registry first."
 
 
 def get_distributed_operator_impl(name, impl_idx):
-    global DISTRIBUTED_OPERATORS
-    return DISTRIBUTED_OPERATORS[name].get_impl(impl_idx)
+    global _g_distributed_operator_impl_registries
+    return _g_distributed_operator_impl_registries[name].get_impl(impl_idx)
 
 
-def find_best_compatible_distributed_operator_impl(name, op_dist_attr,
-                                                   fwd=True):
+def find_best_compatible_distributed_operator_impl(name, dist_op, fwd=True):
     """
     Here just return the first compatible implemention. 
     This will be improved by cost model in the future.
     """
-    dist_op = get_distributed_operator(name)
-    if dist_op is None:
+    dist_op_impl_container = get_distributed_operator_impl_container(name)
+    if dist_op_impl_container is None:
         return None, -1
     compatible_impls = []
-    impls = dist_op.get_impls()
+    impls = dist_op_impl_container.get_impls()
     if fwd:
         for idx, impl in enumerate(impls):
-            if impl.is_process_mesh_compatible(op_dist_attr) \
-                and impl.is_input_compatible(op_dist_attr):
+            if impl.is_input_compatible(dist_op):
                 compatible_impls.append((impl, idx))
     else:
         for idx, impl in enumerate(impls):
-            if impl.is_process_mesh_compatible(op_dist_attr) \
-                and impl.is_output_compatible(op_dist_attr):
+            if impl.is_output_compatible(dist_op):
                 compatible_impls.append((impl, idx))
 
     if compatible_impls:
@@ -118,48 +111,37 @@ def find_best_compatible_distributed_operator_impl(name, op_dist_attr,
     return best_compatible_impl, idx
 
 
-def copy_distributed_attr_for_var(src_op_dist_attr, var, src_var):
+def copy_distributed_attr_for_var(dist_context, dst_var, src_var):
     """
     copy src var's dist_attr to dst var
     """
-    import copy
-
-    auto_paralle_context = src_op_dist_attr.get_owner_context()
-    dist_attr = copy.deepcopy(
-        auto_paralle_context.get_tensor_distributed_attr_for_program(src_var))
-    dist_attr._owner_tensor = var
-    dist_attr._owner_context = auto_paralle_context.get_tensor_distributed_attr_for_program(
-        src_var)._owner_context
-    auto_paralle_context.set_tensor_distributed_attr_for_program(var, dist_attr)
+    dist_attr = dist_context.get_tensor_dist_attr_for_program(src_var)
+    dist_context.set_tensor_dist_attr_for_program(dst_var, dist_attr)
 
 
-def copy_distributed_attr_for_dist_op(dist_op, dst_block, src_op_dist_attr):
+def copy_distributed_attr_for_dist_op(dist_context, dist_op, dst_block,
+                                      src_op_dist_attr):
     """
     copy src op's dist_attr to dst dist op
     """
-    from ..attribute import OperatorDistributedAttribute
+    from ..dist_attribute import OperatorDistributedAttribute
+    # need check dist op attr and its inputs and outputs
 
-    auto_paralle_context = src_op_dist_attr.get_owner_context()
-    op_dist_attr = OperatorDistributedAttribute(dist_op, auto_paralle_context)
-    auto_paralle_context._copy_distributed_attr_from_op_desc(dist_op.desc,
-                                                             op_dist_attr)
-    auto_paralle_context.set_op_distributed_attr_for_program(dist_op,
-                                                             op_dist_attr)
-
-    op_dist_attr.set_process_mesh(src_op_dist_attr.get_process_mesh())
-    op_dist_attr.set_impl_idx(src_op_dist_attr.get_impl_idx())
+    op_dist_attr = OperatorDistributedAttribute()
+    op_dist_attr.process_mesh = src_op_dist_attr.process_mesh
+    op_dist_attr.impl_idx = src_op_dist_attr.impl_idx
 
     for input_varname in dist_op.desc.input_arg_names():
         input_var = dst_block.var(input_varname)
-        tensor_dist_attr = auto_paralle_context.get_tensor_distributed_attr_for_program(
+        tensor_dist_attr = dist_context.get_tensor_dist_attr_for_program(
             input_var)
-        tensor_dims_mapping = tensor_dist_attr.get_dims_mapping()
-        op_dist_attr.set_input_dims_mapping(input_varname, tensor_dims_mapping)
+        op_dist_attr.set_input_dist_attr(input_varname, tensor_dist_attr)
 
     for output_varname in dist_op.desc.output_arg_names():
         output_var = dst_block.var(output_varname)
-        tensor_dist_attr = auto_paralle_context.get_tensor_distributed_attr_for_program(
+        tensor_dist_attr = dist_context.get_tensor_dist_attr_for_program(
             output_var)
-        tensor_dims_mapping = tensor_dist_attr.get_dims_mapping()
-        op_dist_attr.set_output_dims_mapping(output_varname,
-                                             tensor_dims_mapping)
+        op_dist_attr.set_output_dist_attr(output_varname, tensor_dist_attr)
+
+    dist_context.set_op_dist_attr_for_program(dist_op, op_dist_attr)
+    op_dist_attr = dist_context.get_op_dist_attr_for_program(dist_op)
diff --git a/python/paddle/distributed/auto_parallel/operators/dist_default.py b/python/paddle/distributed/auto_parallel/operators/dist_default.py
index cf17b7afb0f39..05af1b402b425 100755
--- a/python/paddle/distributed/auto_parallel/operators/dist_default.py
+++ b/python/paddle/distributed/auto_parallel/operators/dist_default.py
@@ -12,9 +12,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License
 
-from .common import DistributedOperator
+from .common import DistributedOperatorImplContainer
 from .common import DistributedOperatorImpl
-from .common import register_distributed_operator
+from .common import register_distributed_operator_impl_container
 from .common import register_distributed_operator_impl
 from ..utils import is_dim_shard
 from ..utils import is_dim_replicate
@@ -22,26 +22,27 @@
 from ..utils import compute_compatible_dim_mapping
 from ..utils import compute_compatible_dims_mapping
 from ..utils import compute_compatible_and_update_dim_mapping
-from ..attribute import OperatorDistributedAttribute
+from ..dist_attribute import OperatorDistributedAttribute
 from paddle.fluid import core, unique_name
 from paddle.fluid.framework import in_dygraph_mode
 from paddle.fluid.framework import Program, Parameter, Variable, program_guard
 from paddle.fluid.data_feeder import check_variable_and_dtype, check_dtype
 from paddle.distributed.fleet.meta_optimizers.common import OpRole, OP_ROLE_KEY, OP_ROLE_VAR_KEY
-from ..process import new_process_group
+from ..process_group import new_process_group
 from ..utils import _get_comm_group, _get_corresponding_rank
 
 
-class DistributedDefault(DistributedOperator):
+class DistributedDefault(DistributedOperatorImplContainer):
     def __init__(self, name):
         super(DistributedDefault, self).__init__()
         self._name = name
 
 
-register_distributed_operator("default", DistributedDefault("default"))
+register_distributed_operator_impl_container("default",
+                                             DistributedDefault("default"))
 
 
-# Replicated Default 
+# Replicated Default
 class DistributedDefaultImpl0(DistributedOperatorImpl):
     def __init__(self, name):
         super(DistributedDefaultImpl0, self).__init__()
@@ -49,29 +50,26 @@ def __init__(self, name):
         self._forward_implemented = True
         self._backward_implemented = True
 
-    def is_process_mesh_compatible(self, op_dist_attr):
+    def is_input_compatible(self, dist_op):
         raise NotImplementedError("Please Implement this method.")
 
-    def is_input_compatible(self, op_dist_attr):
+    def is_output_compatible(self, dist_op):
         raise NotImplementedError("Please Implement this method.")
 
-    def is_output_compatible(self, op_dist_attr):
-        raise NotImplementedError("Please Implement this method.")
-
-    def update_dims_mapping(self, op_dist_attr):
+    def update_dims_mapping(self, dist_op):
         raise NotImplementedError("Please Implement this method.")
 
     @staticmethod
     def forward(ctx, *args, **kwargs):
 
-        dist_op_helper = ctx.get_dist_op_helper()
-        main_block = dist_op_helper.get_dst_main_program().global_block()
-        startup_block = dist_op_helper.get_dst_startup_program().global_block()
-        src_op = dist_op_helper.get_cur_src_op()
-        varname_mapping = dist_op_helper.get_varname_mapping()
-        rank_id = dist_op_helper.get_rank_id()
+        dist_op_context = ctx.dist_op_context
+        main_block = dist_op_context.get_dst_main_program().global_block()
+        startup_block = dist_op_context.get_dst_startup_program().global_block()
+        src_op = dist_op_context.get_cur_src_op()
+        varname_mapping = dist_op_context.get_varname_mapping()
+        rank_id = dist_op_context.get_rank_id()
 
-        # check validation of inputs / outputs 
+        # check validation of inputs / outputs
         for input_name in src_op.desc.input_names():
             assert input_name in kwargs, "input [{}] is not given".format(
                 input_name)
@@ -100,26 +98,26 @@ def forward(ctx, *args, **kwargs):
         for varname in dist_op_desc.input_arg_names():
             if startup_block.has_var(varname) and startup_block.var(
                     varname
-            ).is_parameter and varname not in dist_op_helper.already_init_sync_vars:
-                dist_op_helper.already_init_sync_vars.add(varname)
+            ).is_parameter and varname not in dist_op_context.already_init_sync_vars:
+                dist_op_context.already_init_sync_vars.add(varname)
                 param = startup_block.var(varname)
-                param_dist_attr = ctx.get_tensor_distributed_attr_for_program(
-                    param)
-                process_mesh = param_dist_attr.get_process_mesh()
-                dims_mapping = param_dist_attr.get_dims_mapping()
+                param_dist_attr = ctx.get_tensor_dist_attr_for_program(param)
+                process_mesh = param_dist_attr.process_mesh
+                dims_mapping = param_dist_attr.dims_mapping
 
                 # FIXME (JZ-LIANG) Remove this hack to support any op mesh group for Pipeline Parallelism
-                if rank_id not in process_mesh.process_group:
-                    rank_id = _get_corresponding_rank(process_mesh, rank_id)
+                if rank_id not in process_mesh.processes:
+                    rank_id = _get_corresponding_rank(ctx, process_mesh,
+                                                      rank_id)
 
-                # NOTE all not splited axis should be presented in mesh 
+                # NOTE all not splited axis should be presented in mesh
                 for axis, size in enumerate(process_mesh.topology):
                     if size <= 1 or axis in dims_mapping:
                         pass
                     else:
-                        group_ranks = _get_comm_group(
-                            process_mesh.process_group, process_mesh.topology,
-                            axis, rank_id)
+                        group_ranks = _get_comm_group(process_mesh.processes,
+                                                      process_mesh.topology,
+                                                      axis, rank_id)
                         sync_group = new_process_group(group_ranks)
 
                         new_op = startup_block.append_op(
@@ -134,12 +132,12 @@ def forward(ctx, *args, **kwargs):
                             })
 
                         # set distributed attribute
-                        op_attr = OperatorDistributedAttribute(new_op, ctx)
-                        op_attr.set_process_mesh(process_mesh)
+                        op_attr = OperatorDistributedAttribute()
+                        op_attr.process_mesh = process_mesh
                         op_attr.set_output_dims_mapping(param.name,
                                                         dims_mapping)
                         op_attr.set_input_dims_mapping(param.name, dims_mapping)
-                        ctx.set_op_distributed_attr_for_program(new_op, op_attr)
+                        ctx.set_op_dist_attr_for_program(new_op, op_attr)
 
                 startup_block._sync_with_cpp()
 
@@ -147,16 +145,16 @@ def forward(ctx, *args, **kwargs):
     def backward(ctx, *args, **kwargs):
 
         # by now the backward function only insert the gradient allreduce for dist op itself
-        dist_op_helper = ctx.get_dist_op_helper()
-        main_block = dist_op_helper.get_dst_main_program().global_block()
-        backward_op = dist_op_helper.get_cur_src_op()
-        dist_attr = ctx.get_op_distributed_attr_for_program(backward_op)
+        dist_op_context = ctx.dist_op_context
+        main_block = dist_op_context.get_dst_main_program().global_block()
+        backward_op = dist_op_context.get_cur_src_op()
+        dist_attr = ctx.get_op_dist_attr_for_program(backward_op)
         assert dist_attr is not None, "backward op [{}] don't have dist attribute !".format(
             str(backward_op))
-        rank_id = dist_op_helper.get_rank_id()
+        rank_id = dist_op_context.get_rank_id()
 
         # check if need gradient allreduce
-        # if there is a non-gradient & non-parameter input and its batch dimension is splited, 
+        # if there is a non-gradient & non-parameter input and its batch dimension is splited,
         # we need insert gradient allreduce for the gradient of parameter in its output
         need_gradient_allreduce = False
         for input_name in backward_op.desc.input_names():
@@ -165,20 +163,21 @@ def backward(ctx, *args, **kwargs):
                         varname).is_parameter:
 
                     # NOTE input var's dim_mapping of backward op should be the same with input var instead of corresponding varname of forward op
-                    process_mesh = dist_attr.get_process_mesh()
+                    process_mesh = dist_attr.process_mesh
                     var_dim_mapping = dist_attr.get_input_dims_mapping(varname)
 
                     # FIXME (JZ-LIANG) Remove this hack to support any op mesh group for Pipeline Parallelism
-                    if rank_id not in process_mesh.process_group:
-                        rank_id = _get_corresponding_rank(process_mesh, rank_id)
+                    if rank_id not in process_mesh.processes:
+                        rank_id = _get_corresponding_rank(ctx, process_mesh,
+                                                          rank_id)
 
                     mesh_shape = process_mesh.topology
                     batch_size_axis = var_dim_mapping[0]
                     if batch_size_axis > -1 and mesh_shape[batch_size_axis] > 1:
                         need_gradient_allreduce = True
-                        group_ranks = _get_comm_group(
-                            process_mesh.process_group, process_mesh.topology,
-                            batch_size_axis, rank_id)
+                        group_ranks = _get_comm_group(process_mesh.processes,
+                                                      process_mesh.topology,
+                                                      batch_size_axis, rank_id)
                         dp_degree = len(group_ranks)
                         dp_group = new_process_group(group_ranks)
                         break
@@ -228,17 +227,17 @@ def backward(ctx, *args, **kwargs):
                             OP_ROLE_KEY: OpRole.Backward
                         })
 
-                    dims_mapping = ctx.get_tensor_distributed_attr_for_program(
-                        grad_var).get_dims_mapping()
-                    process_mesh = dist_attr.get_process_mesh()
+                    dims_mapping = ctx.get_tensor_dist_attr_for_program(
+                        grad_var).dims_mapping
+                    process_mesh = dist_attr.process_mesh
                     for op in [allreduce_op, scale_op]:
-                        op_attr = OperatorDistributedAttribute(op, ctx)
-                        op_attr.set_process_mesh(process_mesh)
+                        op_attr = OperatorDistributedAttribute()
+                        op_attr.process_mesh = process_mesh
                         op_attr.set_output_dims_mapping(grad_var.name,
                                                         dims_mapping)
                         op_attr.set_input_dims_mapping(grad_var.name,
                                                        dims_mapping)
-                        ctx.set_op_distributed_attr_for_program(op, op_attr)
+                        ctx.set_op_dist_attr_for_program(op, op_attr)
 
                 main_block._sync_with_cpp()
 
diff --git a/python/paddle/distributed/auto_parallel/operators/dist_embedding.py b/python/paddle/distributed/auto_parallel/operators/dist_embedding.py
index cd6d2255c81f1..0099d6a09c47f 100755
--- a/python/paddle/distributed/auto_parallel/operators/dist_embedding.py
+++ b/python/paddle/distributed/auto_parallel/operators/dist_embedding.py
@@ -12,9 +12,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License
 
-from .common import DistributedOperator
+from .common import DistributedOperatorImplContainer
 from .common import DistributedOperatorImpl
-from .common import register_distributed_operator
+from .common import register_distributed_operator_impl_container
 from .common import register_distributed_operator_impl
 from .common import copy_distributed_attr_for_var
 from .common import copy_distributed_attr_for_dist_op
@@ -24,25 +24,26 @@
 from ..utils import compute_compatible_dim_mapping
 from ..utils import compute_compatible_dims_mapping
 from ..utils import compute_compatible_and_update_dim_mapping
-from ..attribute import OperatorDistributedAttribute
+from ..dist_attribute import OperatorDistributedAttribute
 from paddle.fluid import core, unique_name
 from paddle.fluid.framework import in_dygraph_mode
 from paddle.fluid.framework import Program, Parameter, Variable, program_guard
 from paddle.fluid.data_feeder import check_variable_and_dtype, check_dtype
 from paddle.distributed.fleet.meta_optimizers.common import OpRole, OP_ROLE_KEY, OP_ROLE_VAR_KEY
-from ..process import new_process_group
+from ..process_group import new_process_group
 from ..utils import _get_comm_group, _get_idx_in_axis, _get_corresponding_rank
 
 
-class DistributedEmbedding(DistributedOperator):
+class DistributedEmbedding(DistributedOperatorImplContainer):
     def __init__(self, name):
         super(DistributedEmbedding, self).__init__()
         self._name = name
 
 
-register_distributed_operator("lookup_table_v2",
-                              DistributedEmbedding("embedding"))
-register_distributed_operator("c_embedding", DistributedEmbedding("embedding"))
+register_distributed_operator_impl_container("lookup_table_v2",
+                                             DistributedEmbedding("embedding"))
+register_distributed_operator_impl_container("c_embedding",
+                                             DistributedEmbedding("embedding"))
 
 
 # RowParallel
@@ -53,12 +54,9 @@ def __init__(self, name):
         self._forward_implemented = True
         self._backward_implemented = True
 
-    def is_process_mesh_compatible(self, op_dist_attr):
-        """ No restriction for now. """
-        return True
-
-    def is_input_compatible(self, op_dist_attr):
-        op_desc = op_dist_attr.get_owner_op().desc
+    def is_input_compatible(self, dist_op):
+        op_desc = dist_op.serial_op.desc
+        op_dist_attr = dist_op.dist_attr
         ids_name = op_desc.input('Ids')[0]
         w_name = op_desc.input('W')[0]
         ids_dims_mapping = op_dist_attr.get_input_dims_mapping(ids_name)
@@ -72,8 +70,9 @@ def is_input_compatible(self, op_dist_attr):
                 return False
         return True
 
-    def is_output_compatible(self, op_dist_attr):
-        op_desc = op_dist_attr.get_owner_op().desc
+    def is_output_compatible(self, dist_op):
+        op_desc = dist_op.serial_op.desc
+        op_dist_attr = dist_op.dist_attr
         out_name = op_desc.output('Out')[0]
         out_dims_mapping = op_dist_attr.get_output_dims_mapping(out_name)
         # Other dimensions must be replicate except the batch dimension
@@ -82,9 +81,10 @@ def is_output_compatible(self, op_dist_attr):
                 return False
         return True
 
-    def update_dims_mapping(self, op_dist_attr):
+    def update_dims_mapping(self, dist_op):
         changed = False
-        op_desc = op_dist_attr.get_owner_op().desc
+        op_desc = dist_op.serial_op.desc
+        op_dist_attr = dist_op.dist_attr
         ids_name = op_desc.input('Ids')[0]
         w_name = op_desc.input('W')[0]
         out_name = op_desc.output('Out')[0]
@@ -111,16 +111,16 @@ def forward(ctx, *args, **kwargs):
         kwargs: inputname_mapping & outputname_mapping
         """
 
-        dist_op_helper = ctx.get_dist_op_helper()
-        main_block = dist_op_helper.get_dst_main_program().global_block()
-        startup_block = dist_op_helper.get_dst_startup_program().global_block()
-        src_op = dist_op_helper.get_cur_src_op()
-        rank_id = dist_op_helper.get_rank_id()
-        op_dist_attr = ctx.get_op_distributed_attr_for_program(src_op)
+        dist_op_context = ctx.dist_op_context
+        main_block = dist_op_context.get_dst_main_program().global_block()
+        startup_block = dist_op_context.get_dst_startup_program().global_block()
+        src_op = dist_op_context.get_cur_src_op()
+        rank_id = dist_op_context.get_rank_id()
+        op_dist_attr = ctx.get_op_dist_attr_for_program(src_op)
         assert op_dist_attr is not None, "backward op [{}] don't have dist attribute !".format(
             str(src_op))
 
-        # check validation of inputs / outputs 
+        # check validation of inputs / outputs
         assert 'Ids' in kwargs, "input [{}] is not given".format('Ids')
         assert 'W' in kwargs, "input [{}] is not given".format('W')
         assert 'Out' in kwargs, "output [{}] is not given".format('Out')
@@ -147,12 +147,12 @@ def forward(ctx, *args, **kwargs):
             Weight_var.name)[0]
         assert embedding_row_dim_mapping >= 0, "row_parallel_embedding's row should be divided by a specific mesh axis, but got [{}]".format(
             embedding_row_dim_mapping)
-        process_mesh_shape = op_dist_attr.get_process_mesh().topology
-        process_mesh_group = op_dist_attr.get_process_mesh().process_group
+        process_mesh_shape = op_dist_attr.process_mesh.topology
+        process_mesh_group = op_dist_attr.process_mesh.processes
 
         # FIXME (JZ-LIANG) Remove this hack to support any op mesh group for Pipeline Parallelism
         if rank_id not in process_mesh_group:
-            rank_id = _get_corresponding_rank(op_dist_attr.get_process_mesh(),
+            rank_id = _get_corresponding_rank(ctx, op_dist_attr.process_mesh,
                                               rank_id)
 
         # A generalized method to caculate embedding offset using cartisian product
@@ -162,7 +162,7 @@ def forward(ctx, *args, **kwargs):
         per_part_size = Weight_var.shape[0]
         relative_idx = relative_idx * per_part_size
 
-        # TODO caculate ring id 
+        # TODO caculate ring id
         parallel_axis = embedding_row_dim_mapping
         group_ranks = _get_comm_group(process_mesh_group, process_mesh_shape,
                                       parallel_axis, rank_id)
@@ -182,7 +182,7 @@ def forward(ctx, *args, **kwargs):
             stop_gradient=Out_var.stop_gradient)
 
         # copy Out_var's dist_attr to intermediate_var_0's dist_attr
-        copy_distributed_attr_for_var(op_dist_attr, intermediate_var_0, Out_var)
+        copy_distributed_attr_for_var(ctx, intermediate_var_0, Out_var)
 
         check_variable_and_dtype(
             Out_var, 'tensor',
@@ -208,25 +208,25 @@ def forward(ctx, *args, **kwargs):
             })
 
         # copy serial op's dist_attr to dist op's dist_attr
-        copy_distributed_attr_for_dist_op(c_embedding_op, main_block,
+        copy_distributed_attr_for_dist_op(ctx, c_embedding_op, main_block,
                                           op_dist_attr)
-        copy_distributed_attr_for_dist_op(c_allreduce_sum_op, main_block,
+        copy_distributed_attr_for_dist_op(ctx, c_allreduce_sum_op, main_block,
                                           op_dist_attr)
 
         # param initialization sync
-        assert Weight_var.name not in dist_op_helper.already_init_sync_vars
-        dist_op_helper.already_init_sync_vars.add(Weight_var.name)
+        assert Weight_var.name not in dist_op_context.already_init_sync_vars
+        dist_op_context.already_init_sync_vars.add(Weight_var.name)
         param = startup_block.var(Weight_var.name)
-        param_dist_attr = ctx.get_tensor_distributed_attr_for_program(param)
-        process_mesh = param_dist_attr.get_process_mesh()
-        dim_mapping = param_dist_attr.get_dims_mapping()
+        param_dist_attr = ctx.get_tensor_dist_attr_for_program(param)
+        process_mesh = param_dist_attr.process_mesh
+        dim_mapping = param_dist_attr.dims_mapping
 
-        # NOTE all not splited axis should be presented in mesh 
+        # NOTE all not splited axis should be presented in mesh
         for axis, size in enumerate(process_mesh.topology):
             if size <= 1 or axis in dim_mapping:
                 pass
             else:
-                group_ranks = _get_comm_group(process_mesh.process_group,
+                group_ranks = _get_comm_group(process_mesh.processes,
                                               process_mesh.topology, axis,
                                               rank_id)
                 sync_group = new_process_group(group_ranks)
@@ -247,17 +247,17 @@ def forward(ctx, *args, **kwargs):
     def backward(ctx, *args, **kwargs):
 
         # by now the backward function only insert the gradient allreduce for dist op itself
-        dist_op_helper = ctx.get_dist_op_helper()
-        main_block = dist_op_helper.get_dst_main_program().global_block()
-        backward_op = dist_op_helper.get_cur_src_op()
-        rank_id = dist_op_helper.get_rank_id()
-        dist_attr = ctx.get_op_distributed_attr_for_program(backward_op)
+        dist_op_context = ctx.dist_op_context
+        main_block = dist_op_context.get_dst_main_program().global_block()
+        backward_op = dist_op_context.get_cur_src_op()
+        rank_id = dist_op_context.get_rank_id()
+        dist_attr = ctx.get_op_dist_attr_for_program(backward_op)
         assert dist_attr is not None, "backward op [{}] don't have dist attribute !".format(
             str(backward_op))
 
         # FIXME (JZ-LIANG) Remove this hack to support any op mesh group for Pipeline Parallelism
-        if rank_id not in dist_attr.get_process_mesh().process_group:
-            rank_id = _get_corresponding_rank(dist_attr.get_process_mesh(),
+        if rank_id not in dist_attr.process_mesh.processes:
+            rank_id = _get_corresponding_rank(ctx, dist_attr.process_mesh,
                                               rank_id)
 
         # check if need gradient allreduce
@@ -286,14 +286,14 @@ def backward(ctx, *args, **kwargs):
             kwargs['W@GRAD'])
 
         Ids_var = main_block.var(kwargs['Ids'][0])
-        process_mesh = dist_attr.get_process_mesh()
+        process_mesh = dist_attr.process_mesh
         var_dim_mapping = dist_attr.get_input_dims_mapping(Ids_var.name)
         mesh_shape = process_mesh.topology
         batch_size_axis = var_dim_mapping[0]
         if batch_size_axis > -1 and mesh_shape[batch_size_axis] > 1:
             need_gradient_allreduce = True
 
-            group_ranks = _get_comm_group(process_mesh.process_group,
+            group_ranks = _get_comm_group(process_mesh.processes,
                                           process_mesh.topology,
                                           batch_size_axis, rank_id)
             dp_degree = len(group_ranks)
@@ -318,15 +318,15 @@ def backward(ctx, *args, **kwargs):
                        OP_ROLE_KEY: OpRole.Backward})
             main_block._sync_with_cpp()
 
-            dims_mapping = ctx.get_tensor_distributed_attr_for_program(
-                W_Grad_var).get_dims_mapping()
-            process_mesh = dist_attr.get_process_mesh()
+            dims_mapping = ctx.get_tensor_dist_attr_for_program(
+                W_Grad_var).dims_mapping
+            process_mesh = dist_attr.process_mesh
             for op in [allreduce_op, scale_op]:
-                op_attr = OperatorDistributedAttribute(op, ctx)
-                op_attr.set_process_mesh(process_mesh)
+                op_attr = OperatorDistributedAttribute()
+                op_attr.process_mesh = process_mesh
                 op_attr.set_output_dims_mapping(W_Grad_var.name, dims_mapping)
                 op_attr.set_input_dims_mapping(W_Grad_var.name, dims_mapping)
-                ctx.set_op_distributed_attr_for_program(op, op_attr)
+                ctx.set_op_dist_attr_for_program(op, op_attr)
 
 
 register_distributed_operator_impl("lookup_table_v2",
diff --git a/python/paddle/distributed/auto_parallel/operators/dist_matmul.py b/python/paddle/distributed/auto_parallel/operators/dist_matmul.py
index 2edbcd2318cdf..43816ba88af80 100644
--- a/python/paddle/distributed/auto_parallel/operators/dist_matmul.py
+++ b/python/paddle/distributed/auto_parallel/operators/dist_matmul.py
@@ -12,9 +12,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License
 
-from .common import DistributedOperator
+from .common import DistributedOperatorImplContainer
 from .common import DistributedOperatorImpl
-from .common import register_distributed_operator
+from .common import register_distributed_operator_impl_container
 from .common import register_distributed_operator_impl
 from .common import copy_distributed_attr_for_var
 from .common import copy_distributed_attr_for_dist_op
@@ -24,19 +24,20 @@
 from ..utils import compute_compatible_dim_mapping
 from ..utils import compute_compatible_dims_mapping
 from ..utils import compute_compatible_and_update_dim_mapping
-from ..attribute import OperatorDistributedAttribute
+from ..dist_attribute import OperatorDistributedAttribute
 from paddle.fluid import core, unique_name
 from paddle.fluid.framework import in_dygraph_mode
 from paddle.fluid.framework import Program, Parameter, Variable, program_guard
 from paddle.fluid.data_feeder import check_variable_and_dtype, check_dtype
 from paddle.distributed.fleet.meta_optimizers.common import OpRole, OP_ROLE_KEY, OP_ROLE_VAR_KEY
-from ..process import new_process_group
+from ..process_group import new_process_group
 from ..utils import _get_comm_group, _get_corresponding_rank
 
 
-def _update_dims_mapping_for_matmul(op_dist_attr):
+def _update_dims_mapping_for_matmul(dist_op):
     changed = False
-    op_desc = op_dist_attr.get_owner_op().desc
+    op_desc = dist_op.serial_op.desc
+    op_dist_attr = dist_op.dist_attr
     x_name = op_desc.input('X')[0]
     y_name = op_desc.input('Y')[0]
     out_name = op_desc.output('Out')[0]
@@ -53,7 +54,7 @@ def _update_dims_mapping_for_matmul(op_dist_attr):
     if y_dims_mapping_len == 1:
         y_dims_mapping.insert(1, -1)
 
-    # Deal with dim > 2 and take care of broadcasting 
+    # Deal with dim > 2 and take care of broadcasting
     if out_dims_mapping_len > 2:
         broadcast_x_dims_mapping = []
         broadcast_y_dims_mapping = []
@@ -95,7 +96,7 @@ def _update_dims_mapping_for_matmul(op_dist_attr):
                 out_dims_mapping[i] = compatible_dims_mapping[i]
                 changed = True
 
-    # The following which uses negative index can be work 
+    # The following which uses negative index can be work
     # when len(out_dims_mapping) > 2 and len(out_dims_mapping) <=2
     dim_changed = compute_compatible_and_update_dim_mapping(
         [x_dims_mapping, y_dims_mapping], [-1, -2])
@@ -112,7 +113,7 @@ def _update_dims_mapping_for_matmul(op_dist_attr):
     if dim_changed:
         changed = True
 
-    # Remove unnecessary dim mapping to make sure the lenght of dims_mapping is same as its tensor
+    # Remove unnecessary dim mapping to make sure the length of dims_mapping is same as its tensor
     if x_dims_mapping_len == 1:
         x_dims_mapping.pop(0)
     if y_dims_mapping_len == 1:
@@ -129,17 +130,17 @@ def _right_operand_parameter_matmul_backward(ctx, *args, **kwargs):
 
     # by now the backward function only insert the gradient allreduce for dist op itself
 
-    dist_op_helper = ctx.get_dist_op_helper()
-    main_block = dist_op_helper.get_dst_main_program().global_block()
-    backward_op = dist_op_helper.get_cur_src_op()
-    rank_id = dist_op_helper.get_rank_id()
-    dist_attr = ctx.get_op_distributed_attr_for_program(backward_op)
+    dist_op_context = ctx.dist_op_context
+    main_block = dist_op_context.get_dst_main_program().global_block()
+    backward_op = dist_op_context.get_cur_src_op()
+    rank_id = dist_op_context.get_rank_id()
+    dist_attr = ctx.get_op_dist_attr_for_program(backward_op)
     assert dist_attr is not None, "backward op [{}] don't have dist attribute !".format(
         str(backward_op))
 
     # FIXME (JZ-LIANG) Remove this hack to support any op mesh group for Pipeline Parallelism
-    if rank_id not in dist_attr.get_process_mesh().process_group:
-        rank_id = _get_corresponding_rank(dist_attr.get_process_mesh(), rank_id)
+    if rank_id not in dist_attr.process_mesh.processes:
+        rank_id = _get_corresponding_rank(ctx, dist_attr.process_mesh, rank_id)
 
     # check if need gradient allreduce
     need_gradient_allreduce = False
@@ -175,13 +176,13 @@ def _right_operand_parameter_matmul_backward(ctx, *args, **kwargs):
     assert not X_var.is_parameter, "left operand(X) [{}] of dist matmul should not be parameter".format(
         X_var.name)
 
-    process_mesh = dist_attr.get_process_mesh()
+    process_mesh = dist_attr.process_mesh
     var_dim_mapping = dist_attr.get_input_dims_mapping(X_var.name)
     mesh_shape = process_mesh.topology
     batch_size_axis = var_dim_mapping[0]
     if batch_size_axis > -1 and mesh_shape[batch_size_axis] > 1:
         need_gradient_allreduce = True
-        group_ranks = _get_comm_group(process_mesh.process_group,
+        group_ranks = _get_comm_group(process_mesh.processes,
                                       process_mesh.topology, batch_size_axis,
                                       rank_id)
         dp_degree = len(group_ranks)
@@ -207,32 +208,32 @@ def _right_operand_parameter_matmul_backward(ctx, *args, **kwargs):
                    OP_ROLE_KEY: OpRole.Backward})
         main_block._sync_with_cpp()
 
-        dims_mapping = ctx.get_tensor_distributed_attr_for_program(
-            Y_Grad_var).get_dims_mapping()
-        process_mesh = dist_attr.get_process_mesh()
+        dims_mapping = ctx.get_tensor_dist_attr_for_program(
+            Y_Grad_var).dims_mapping
+        process_mesh = dist_attr.process_mesh
         for op in [allreduce_op, scale_op]:
-            op_attr = OperatorDistributedAttribute(op, ctx)
-            op_attr.set_process_mesh(process_mesh)
+            op_attr = OperatorDistributedAttribute()
+            op_attr.process_mesh = process_mesh
             op_attr.set_output_dims_mapping(Y_Grad_var.name, dims_mapping)
             op_attr.set_input_dims_mapping(Y_Grad_var.name, dims_mapping)
-            ctx.set_op_distributed_attr_for_program(op, op_attr)
+            ctx.set_op_dist_attr_for_program(op, op_attr)
 
 
-def _init_param_sync(Weight_var, dist_op_helper, startup_block, ctx, rank_id):
+def _init_param_sync(Weight_var, dist_op_context, startup_block, ctx, rank_id):
 
-    assert Weight_var.name not in dist_op_helper.already_init_sync_vars
+    assert Weight_var.name not in dist_op_context.already_init_sync_vars
     assert startup_block.has_var(Weight_var.name)
-    dist_op_helper.already_init_sync_vars.add(Weight_var.name)
+    dist_op_context.already_init_sync_vars.add(Weight_var.name)
     param = startup_block.var(Weight_var.name)
-    param_dist_attr = ctx.get_tensor_distributed_attr_for_program(param)
-    process_mesh = param_dist_attr.get_process_mesh()
-    dim_mapping = param_dist_attr.get_dims_mapping()
+    param_dist_attr = ctx.get_tensor_dist_attr_for_program(param)
+    process_mesh = param_dist_attr.process_mesh
+    dim_mapping = param_dist_attr.dims_mapping
 
     for axis, size in enumerate(process_mesh.topology):
         if size <= 1 or axis in dim_mapping:
             pass
         else:
-            group_ranks = _get_comm_group(process_mesh.process_group,
+            group_ranks = _get_comm_group(process_mesh.processes,
                                           process_mesh.topology, axis, rank_id)
             sync_group = new_process_group(group_ranks)
 
@@ -249,13 +250,14 @@ def _init_param_sync(Weight_var, dist_op_helper, startup_block, ctx, rank_id):
     startup_block._sync_with_cpp()
 
 
-class DistributedMatmul(DistributedOperator):
+class DistributedMatmul(DistributedOperatorImplContainer):
     def __init__(self, name):
         super(DistributedMatmul, self).__init__()
         self._name = name
 
 
-register_distributed_operator("matmul", DistributedMatmul("matmul"))
+register_distributed_operator_impl_container("matmul",
+                                             DistributedMatmul("matmul"))
 
 
 # ColumnParallel
@@ -266,12 +268,9 @@ def __init__(self, name):
         self._forward_implemented = True
         self._backward_implemented = True
 
-    def is_process_mesh_compatible(self, op_dist_attr):
-        """ No restriction for now. """
-        return True
-
-    def is_input_compatible(self, op_dist_attr):
-        op_desc = op_dist_attr.get_owner_op().desc
+    def is_input_compatible(self, dist_op):
+        op_desc = dist_op.serial_op.desc
+        op_dist_attr = dist_op.dist_attr
         x_name = op_desc.input('X')[0]
         y_name = op_desc.input('Y')[0]
         x_dims_mapping = op_dist_attr.get_input_dims_mapping(x_name)
@@ -286,8 +285,9 @@ def is_input_compatible(self, op_dist_attr):
                 return False
         return True
 
-    def is_output_compatible(self, op_dist_attr):
-        op_desc = op_dist_attr.get_owner_op().desc
+    def is_output_compatible(self, dist_op):
+        op_desc = dist_op.serial_op.desc
+        op_dist_attr = dist_op.dist_attr
         out_name = op_desc.output('Out')[0]
         out_dims_mapping = op_dist_attr.get_output_dims_mapping(out_name)
         if is_dim_replicate(out_dims_mapping[-1]):
@@ -297,9 +297,9 @@ def is_output_compatible(self, op_dist_attr):
                 return False
         return True
 
-    def update_dims_mapping(self, op_dist_attr):
+    def update_dims_mapping(self, dist_op):
         changed = False
-        dim_changed = _update_dims_mapping_for_matmul(op_dist_attr)
+        dim_changed = _update_dims_mapping_for_matmul(dist_op)
         if dim_changed:
             changed = True
         return changed
@@ -310,21 +310,21 @@ def forward(ctx, *args, **kwargs):
         kwargs: inputname_mapping & outputname_mapping
         """
 
-        dist_op_helper = ctx.get_dist_op_helper()
-        main_block = dist_op_helper.get_dst_main_program().global_block()
-        startup_block = dist_op_helper.get_dst_startup_program().global_block()
-        src_op = dist_op_helper.get_cur_src_op()
-        rank_id = dist_op_helper.get_rank_id()
-        op_dist_attr = ctx.get_op_distributed_attr_for_program(src_op)
+        dist_op_context = ctx.dist_op_context
+        main_block = dist_op_context.get_dst_main_program().global_block()
+        startup_block = dist_op_context.get_dst_startup_program().global_block()
+        src_op = dist_op_context.get_cur_src_op()
+        rank_id = dist_op_context.get_rank_id()
+        op_dist_attr = ctx.get_op_dist_attr_for_program(src_op)
         assert op_dist_attr is not None, "backward op [{}] don't have dist attribute !".format(
             str(src_op))
 
         # FIXME (JZ-LIANG) Remove this hack to support any op mesh group for Pipeline Parallelism
-        if rank_id not in op_dist_attr.get_process_mesh().process_group:
-            rank_id = _get_corresponding_rank(op_dist_attr.get_process_mesh(),
+        if rank_id not in op_dist_attr.process_mesh.processes:
+            rank_id = _get_corresponding_rank(ctx, op_dist_attr.process_mesh,
                                               rank_id)
 
-        # check validation of inputs / outputs 
+        # check validation of inputs / outputs
         for input_name in src_op.desc.input_names():
             assert input_name in kwargs, "input [{}] is not given".format(
                 input_name)
@@ -348,8 +348,8 @@ def forward(ctx, *args, **kwargs):
             Weight_var.name)[1]
         assert matmul_col_dim_mapping >= 0, "col_parallel_matmul's row should be divided by a specific mesh axis, but got [{}]".format(
             matmul_col_dim_mapping)
-        process_mesh_shape = op_dist_attr.get_process_mesh().topology
-        process_mesh_group = op_dist_attr.get_process_mesh().process_group
+        process_mesh_shape = op_dist_attr.process_mesh.topology
+        process_mesh_group = op_dist_attr.process_mesh.processes
 
         parallel_axis = matmul_col_dim_mapping
         group_ranks = _get_comm_group(process_mesh_group, process_mesh_shape,
@@ -365,7 +365,7 @@ def forward(ctx, *args, **kwargs):
             persistable=False,
             stop_gradient=X_var.stop_gradient)
         # copy X_var's dist_attr to intermediate_var_0's dist_attr
-        copy_distributed_attr_for_var(op_dist_attr, intermediate_var_0, X_var)
+        copy_distributed_attr_for_var(ctx, intermediate_var_0, X_var)
 
         check_variable_and_dtype(
             X_var, 'tensor',
@@ -395,13 +395,14 @@ def forward(ctx, *args, **kwargs):
             type='matmul', inputs=inputs, outputs={'Out': Out_var}, attrs=attrs)
 
         # copy serial op's dist_attr to dist op's dist_attr
-        copy_distributed_attr_for_dist_op(c_identity_op, main_block,
+        copy_distributed_attr_for_dist_op(ctx, c_identity_op, main_block,
+                                          op_dist_attr)
+        copy_distributed_attr_for_dist_op(ctx, matmul_op, main_block,
                                           op_dist_attr)
-        copy_distributed_attr_for_dist_op(matmul_op, main_block, op_dist_attr)
 
         # init param sync
         if Weight_var.is_parameter:
-            _init_param_sync(Weight_var, dist_op_helper, startup_block, ctx,
+            _init_param_sync(Weight_var, dist_op_context, startup_block, ctx,
                              rank_id)
 
     @staticmethod
@@ -417,12 +418,9 @@ def __init__(self, name):
         self._forward_implemented = True
         self._backward_implemented = True
 
-    def is_process_mesh_compatible(self, op_dist_attr):
-        """ No restriction for now. """
-        return True
-
-    def is_input_compatible(self, op_dist_attr):
-        op_desc = op_dist_attr.get_owner_op().desc
+    def is_input_compatible(self, dist_op):
+        op_desc = dist_op.serial_op.desc
+        op_dist_attr = dist_op.dist_attr
         x_name = op_desc.input('X')[0]
         y_name = op_desc.input('Y')[0]
         x_dims_mapping = op_dist_attr.get_input_dims_mapping(x_name)
@@ -438,8 +436,9 @@ def is_input_compatible(self, op_dist_attr):
                 return False
         return True
 
-    def is_output_compatible(self, op_dist_attr):
-        op_desc = op_dist_attr.get_owner_op().desc
+    def is_output_compatible(self, dist_op):
+        op_desc = dist_op.serial_op.desc
+        op_dist_attr = dist_op.dist_attr
         out_name = op_desc.output('Out')[0]
         out_dims_mapping = op_dist_attr.get_output_dims_mapping(out_name)
         if is_dim_shard(out_dims_mapping[-1]):
@@ -450,9 +449,9 @@ def is_output_compatible(self, op_dist_attr):
                 return False
         return True
 
-    def update_dims_mapping(self, op_dist_attr):
+    def update_dims_mapping(self, dist_op):
         changed = False
-        dim_changed = _update_dims_mapping_for_matmul(op_dist_attr)
+        dim_changed = _update_dims_mapping_for_matmul(dist_op)
         if dim_changed:
             changed = True
         return changed
@@ -463,21 +462,21 @@ def forward(ctx, *args, **kwargs):
         kwargs: inputname_mapping & outputname_mapping
         """
 
-        dist_op_helper = ctx.get_dist_op_helper()
-        main_block = dist_op_helper.get_dst_main_program().global_block()
-        startup_block = dist_op_helper.get_dst_startup_program().global_block()
-        src_op = dist_op_helper.get_cur_src_op()
-        rank_id = dist_op_helper.get_rank_id()
-        op_dist_attr = ctx.get_op_distributed_attr_for_program(src_op)
+        dist_op_context = ctx.dist_op_context
+        main_block = dist_op_context.get_dst_main_program().global_block()
+        startup_block = dist_op_context.get_dst_startup_program().global_block()
+        src_op = dist_op_context.get_cur_src_op()
+        rank_id = dist_op_context.get_rank_id()
+        op_dist_attr = ctx.get_op_dist_attr_for_program(src_op)
         assert op_dist_attr is not None, "backward op [{}] don't have dist attribute !".format(
             str(src_op))
 
         # FIXME (JZ-LIANG) Remove this hack to support any op mesh group for Pipeline Parallelism
-        if rank_id not in op_dist_attr.get_process_mesh().process_group:
-            rank_id = _get_corresponding_rank(op_dist_attr.get_process_mesh(),
+        if rank_id not in op_dist_attr.process_mesh.processes:
+            rank_id = _get_corresponding_rank(ctx, op_dist_attr.process_mesh,
                                               rank_id)
 
-        # check validation of inputs / outputs 
+        # check validation of inputs / outputs
         for input_name in src_op.desc.input_names():
             assert input_name in kwargs, "input [{}] is not given".format(
                 input_name)
@@ -501,8 +500,8 @@ def forward(ctx, *args, **kwargs):
             Weight_var.name)[0]
         assert matmul_row_dim_mapping >= 0, "row_parallel_matmul's row should be divided by a specific mesh axis, but got [{}]".format(
             matmul_row_dim_mapping)
-        process_mesh_shape = op_dist_attr.get_process_mesh().topology
-        process_mesh_group = op_dist_attr.get_process_mesh().process_group
+        process_mesh_shape = op_dist_attr.process_mesh.topology
+        process_mesh_group = op_dist_attr.process_mesh.processes
 
         parallel_axis = matmul_row_dim_mapping
         group_ranks = _get_comm_group(process_mesh_group, process_mesh_shape,
@@ -528,7 +527,7 @@ def forward(ctx, *args, **kwargs):
             is_data=False,
             need_check_feed=Out_var.desc.need_check_feed())
         # copy Out_var's dist_attr to intermediate_var_0's dist_attr
-        copy_distributed_attr_for_var(op_dist_attr, intermediate_var_0, Out_var)
+        copy_distributed_attr_for_var(ctx, intermediate_var_0, Out_var)
 
         matmul_op = main_block.append_op(
             type='matmul',
@@ -547,13 +546,14 @@ def forward(ctx, *args, **kwargs):
             })
 
         # copy serial op's dist_attr to dist op's dist_attr
-        copy_distributed_attr_for_dist_op(matmul_op, main_block, op_dist_attr)
-        copy_distributed_attr_for_dist_op(c_allreduce_sum_op, main_block,
+        copy_distributed_attr_for_dist_op(ctx, matmul_op, main_block,
+                                          op_dist_attr)
+        copy_distributed_attr_for_dist_op(ctx, c_allreduce_sum_op, main_block,
                                           op_dist_attr)
 
         # init param sync
         if Weight_var.is_parameter:
-            _init_param_sync(Weight_var, dist_op_helper, startup_block, ctx,
+            _init_param_sync(Weight_var, dist_op_context, startup_block, ctx,
                              rank_id)
 
     @staticmethod
@@ -561,18 +561,15 @@ def backward(ctx, *args, **kwargs):
         _right_operand_parameter_matmul_backward(ctx, *args, **kwargs)
 
 
-# ReplicateParallel 
+# ReplicateParallel
 class DistributedMatmulImpl2(DistributedOperatorImpl):
     def __init__(self, name):
         super(DistributedMatmulImpl2, self).__init__()
         self._name = name
 
-    def is_process_mesh_compatible(self, op_dist_attr):
-        """ No restriction for now. """
-        return True
-
-    def is_input_compatible(self, op_dist_attr):
-        op_desc = op_dist_attr.get_owner_op().desc
+    def is_input_compatible(self, dist_op):
+        op_desc = dist_op.serial_op.desc
+        op_dist_attr = dist_op.dist_attr
         x_name = op_desc.input('X')[0]
         y_name = op_desc.input('Y')[0]
         x_dims_mapping = op_dist_attr.get_input_dims_mapping(x_name)
@@ -592,8 +589,9 @@ def is_input_compatible(self, op_dist_attr):
 
         return True
 
-    def is_output_compatible(self, op_dist_attr):
-        op_desc = op_dist_attr.get_owner_op().desc
+    def is_output_compatible(self, dist_op):
+        op_desc = dist_op.serial_op.desc
+        op_dist_attr = dist_op.dist_attr
         out_name = op_desc.output('Out')[0]
         out_dims_mapping = op_dist_attr.get_output_dims_mapping(out_name)
 
@@ -605,9 +603,9 @@ def is_output_compatible(self, op_dist_attr):
 
         return True
 
-    def update_dims_mapping(self, op_dist_attr):
+    def update_dims_mapping(self, dist_op):
         changed = False
-        dim_changed = _update_dims_mapping_for_matmul(op_dist_attr)
+        dim_changed = _update_dims_mapping_for_matmul(dist_op)
         if dim_changed:
             changed = True
         return changed
@@ -625,13 +623,14 @@ def backward(ctx, *args, **kwargs):
                                    DistributedMatmulImpl2("replicate_parallel"))
 
 
-class DistributedMatmulV2(DistributedOperator):
+class DistributedMatmulV2(DistributedOperatorImplContainer):
     def __init__(self, name):
         super(DistributedMatmulV2, self).__init__()
         self._name = name
 
 
-register_distributed_operator("matmul_v2", DistributedMatmulV2("matmul_v2"))
+register_distributed_operator_impl_container("matmul_v2",
+                                             DistributedMatmulV2("matmul_v2"))
 
 
 # ColumnParallel
@@ -642,12 +641,9 @@ def __init__(self, name):
         self._forward_implemented = True
         self._backward_implemented = True
 
-    def is_process_mesh_compatible(self, op_dist_attr):
-        """ No restriction for now. """
-        return True
-
-    def is_input_compatible(self, op_dist_attr):
-        op_desc = op_dist_attr.get_owner_op().desc
+    def is_input_compatible(self, dist_op):
+        op_desc = dist_op.serial_op.desc
+        op_dist_attr = dist_op.dist_attr
         x_name = op_desc.input('X')[0]
         y_name = op_desc.input('Y')[0]
         x_dims_mapping = op_dist_attr.get_input_dims_mapping(x_name)
@@ -662,8 +658,9 @@ def is_input_compatible(self, op_dist_attr):
                 return False
         return True
 
-    def is_output_compatible(self, op_dist_attr):
-        op_desc = op_dist_attr.get_owner_op().desc
+    def is_output_compatible(self, dist_op):
+        op_desc = dist_op.serial_op.desc
+        op_dist_attr = dist_op.dist_attr
         out_name = op_desc.output('Out')[0]
         out_dims_mapping = op_dist_attr.get_output_dims_mapping(out_name)
         if is_dim_replicate(out_dims_mapping[-1]):
@@ -673,9 +670,9 @@ def is_output_compatible(self, op_dist_attr):
                 return False
         return True
 
-    def update_dims_mapping(self, op_dist_attr):
+    def update_dims_mapping(self, dist_op):
         changed = False
-        dim_changed = _update_dims_mapping_for_matmul(op_dist_attr)
+        dim_changed = _update_dims_mapping_for_matmul(dist_op)
         if dim_changed:
             changed = True
         return changed
@@ -686,21 +683,21 @@ def forward(ctx, *args, **kwargs):
         kwargs: inputname_mapping & outputname_mapping
         """
 
-        dist_op_helper = ctx.get_dist_op_helper()
-        main_block = dist_op_helper.get_dst_main_program().global_block()
-        startup_block = dist_op_helper.get_dst_startup_program().global_block()
-        src_op = dist_op_helper.get_cur_src_op()
-        rank_id = dist_op_helper.get_rank_id()
-        op_dist_attr = ctx.get_op_distributed_attr_for_program(src_op)
+        dist_op_context = ctx.dist_op_context
+        main_block = dist_op_context.get_dst_main_program().global_block()
+        startup_block = dist_op_context.get_dst_startup_program().global_block()
+        src_op = dist_op_context.get_cur_src_op()
+        rank_id = dist_op_context.get_rank_id()
+        op_dist_attr = ctx.get_op_dist_attr_for_program(src_op)
         assert op_dist_attr is not None, "backward op [{}] don't have dist attribute !".format(
             str(src_op))
 
         # FIXME (JZ-LIANG) Remove this hack to support any op mesh group for Pipeline Parallelism
-        if rank_id not in op_dist_attr.get_process_mesh().process_group:
-            rank_id = _get_corresponding_rank(op_dist_attr.get_process_mesh(),
+        if rank_id not in op_dist_attr.process_mesh.processes:
+            rank_id = _get_corresponding_rank(ctx, op_dist_attr.process_mesh,
                                               rank_id)
 
-        # check validation of inputs / outputs 
+        # check validation of inputs / outputs
         for input_name in src_op.desc.input_names():
             assert input_name in kwargs, "input [{}] is not given".format(
                 input_name)
@@ -724,8 +721,8 @@ def forward(ctx, *args, **kwargs):
             Weight_var.name)[1]
         assert matmul_col_dim_mapping >= 0, "col_parallel_matmul's row should be divided by a specific mesh axis, but got [{}]".format(
             matmul_col_dim_mapping)
-        process_mesh_shape = op_dist_attr.get_process_mesh().topology
-        process_mesh_group = op_dist_attr.get_process_mesh().process_group
+        process_mesh_shape = op_dist_attr.process_mesh.topology
+        process_mesh_group = op_dist_attr.process_mesh.processes
 
         parallel_axis = matmul_col_dim_mapping
         group_ranks = _get_comm_group(process_mesh_group, process_mesh_shape,
@@ -741,7 +738,7 @@ def forward(ctx, *args, **kwargs):
             persistable=False,
             stop_gradient=X_var.stop_gradient)
         # copy X_var's dist_attr to intermediate_var_0's dist_attr
-        copy_distributed_attr_for_var(op_dist_attr, intermediate_var_0, X_var)
+        copy_distributed_attr_for_var(ctx, intermediate_var_0, X_var)
 
         check_variable_and_dtype(
             X_var, 'tensor',
@@ -770,14 +767,14 @@ def forward(ctx, *args, **kwargs):
             attrs=attrs)
 
         # copy serial op's dist_attr to dist op's dist_attr
-        copy_distributed_attr_for_dist_op(c_identity_op, main_block,
+        copy_distributed_attr_for_dist_op(ctx, c_identity_op, main_block,
                                           op_dist_attr)
-        copy_distributed_attr_for_dist_op(matmul_v2_op, main_block,
+        copy_distributed_attr_for_dist_op(ctx, matmul_v2_op, main_block,
                                           op_dist_attr)
 
         # init param sync
         if Weight_var.is_parameter:
-            _init_param_sync(Weight_var, dist_op_helper, startup_block, ctx,
+            _init_param_sync(Weight_var, dist_op_context, startup_block, ctx,
                              rank_id)
 
     @staticmethod
@@ -793,12 +790,9 @@ def __init__(self, name):
         self._forward_implemented = True
         self._backward_implemented = True
 
-    def is_process_mesh_compatible(self, op_dist_attr):
-        """ No restriction for now. """
-        return True
-
-    def is_input_compatible(self, op_dist_attr):
-        op_desc = op_dist_attr.get_owner_op().desc
+    def is_input_compatible(self, dist_op):
+        op_desc = dist_op.serial_op.desc
+        op_dist_attr = dist_op.dist_attr
         x_name = op_desc.input('X')[0]
         y_name = op_desc.input('Y')[0]
         x_dims_mapping = op_dist_attr.get_input_dims_mapping(x_name)
@@ -814,8 +808,9 @@ def is_input_compatible(self, op_dist_attr):
                 return False
         return True
 
-    def is_output_compatible(self, op_dist_attr):
-        op_desc = op_dist_attr.get_owner_op().desc
+    def is_output_compatible(self, dist_op):
+        op_desc = dist_op.serial_op.desc
+        op_dist_attr = dist_op.dist_attr
         out_name = op_desc.output('Out')[0]
         out_dims_mapping = op_dist_attr.get_output_dims_mapping(out_name)
         if is_dim_shard(out_dims_mapping[-1]):
@@ -826,9 +821,9 @@ def is_output_compatible(self, op_dist_attr):
                 return False
         return True
 
-    def update_dims_mapping(self, op_dist_attr):
+    def update_dims_mapping(self, dist_op):
         changed = False
-        dim_changed = _update_dims_mapping_for_matmul(op_dist_attr)
+        dim_changed = _update_dims_mapping_for_matmul(dist_op)
         if dim_changed:
             changed = True
         return changed
@@ -839,21 +834,21 @@ def forward(ctx, *args, **kwargs):
         kwargs: inputname_mapping & outputname_mapping
         """
 
-        dist_op_helper = ctx.get_dist_op_helper()
-        main_block = dist_op_helper.get_dst_main_program().global_block()
-        startup_block = dist_op_helper.get_dst_startup_program().global_block()
-        src_op = dist_op_helper.get_cur_src_op()
-        rank_id = dist_op_helper.get_rank_id()
-        op_dist_attr = ctx.get_op_distributed_attr_for_program(src_op)
+        dist_op_context = ctx.dist_op_context
+        main_block = dist_op_context.get_dst_main_program().global_block()
+        startup_block = dist_op_context.get_dst_startup_program().global_block()
+        src_op = dist_op_context.get_cur_src_op()
+        rank_id = dist_op_context.get_rank_id()
+        op_dist_attr = ctx.get_op_dist_attr_for_program(src_op)
         assert op_dist_attr is not None, "backward op [{}] don't have dist attribute !".format(
             str(src_op))
 
         # FIXME (JZ-LIANG) Remove this hack to support any op mesh group for Pipeline Parallelism
-        if rank_id not in op_dist_attr.get_process_mesh().process_group:
-            rank_id = _get_corresponding_rank(op_dist_attr.get_process_mesh(),
+        if rank_id not in op_dist_attr.process_mesh.processes:
+            rank_id = _get_corresponding_rank(ctx, op_dist_attr.process_mesh,
                                               rank_id)
 
-        # check validation of inputs / outputs 
+        # check validation of inputs / outputs
         for input_name in src_op.desc.input_names():
             assert input_name in kwargs, "input [{}] is not given".format(
                 input_name)
@@ -877,8 +872,8 @@ def forward(ctx, *args, **kwargs):
             Weight_var.name)[0]
         assert matmul_row_dim_mapping >= 0, "row_parallel_matmul's row should be divided by a specific mesh axis, but got [{}]".format(
             matmul_row_dim_mapping)
-        process_mesh_shape = op_dist_attr.get_process_mesh().topology
-        process_mesh_group = op_dist_attr.get_process_mesh().process_group
+        process_mesh_shape = op_dist_attr.process_mesh.topology
+        process_mesh_group = op_dist_attr.process_mesh.processes
 
         parallel_axis = matmul_row_dim_mapping
         group_ranks = _get_comm_group(process_mesh_group, process_mesh_shape,
@@ -900,7 +895,7 @@ def forward(ctx, *args, **kwargs):
             is_data=False,
             need_check_feed=Out_var.desc.need_check_feed())
         # copy Out_var's dist_attr to intermediate_var_0's dist_attr
-        copy_distributed_attr_for_var(op_dist_attr, intermediate_var_0, Out_var)
+        copy_distributed_attr_for_var(ctx, intermediate_var_0, Out_var)
 
         matmul_v2_op = main_block.append_op(
             type='matmul_v2',
@@ -919,14 +914,14 @@ def forward(ctx, *args, **kwargs):
             })
 
         # copy serial op's dist_attr to dist op's dist_attr
-        copy_distributed_attr_for_dist_op(matmul_v2_op, main_block,
+        copy_distributed_attr_for_dist_op(ctx, matmul_v2_op, main_block,
                                           op_dist_attr)
-        copy_distributed_attr_for_dist_op(c_allreduce_sum_op, main_block,
+        copy_distributed_attr_for_dist_op(ctx, c_allreduce_sum_op, main_block,
                                           op_dist_attr)
 
         # init param sync
         if Weight_var.is_parameter:
-            _init_param_sync(Weight_var, dist_op_helper, startup_block, ctx,
+            _init_param_sync(Weight_var, dist_op_context, startup_block, ctx,
                              rank_id)
 
     @staticmethod
@@ -934,18 +929,15 @@ def backward(ctx, *args, **kwargs):
         _right_operand_parameter_matmul_backward(ctx, *args, **kwargs)
 
 
-# ReplicateParallel 
+# ReplicateParallel
 class DistributedMatmulV2Impl2(DistributedOperatorImpl):
     def __init__(self, name):
         super(DistributedMatmulV2Impl2, self).__init__()
         self._name = name
 
-    def is_process_mesh_compatible(self, op_dist_attr):
-        """ No restriction for now. """
-        return True
-
-    def is_input_compatible(self, op_dist_attr):
-        op_desc = op_dist_attr.get_owner_op().desc
+    def is_input_compatible(self, dist_op):
+        op_desc = dist_op.serial_op.desc
+        op_dist_attr = dist_op.dist_attr
         x_name = op_desc.input('X')[0]
         y_name = op_desc.input('Y')[0]
         x_dims_mapping = op_dist_attr.get_input_dims_mapping(x_name)
@@ -965,8 +957,11 @@ def is_input_compatible(self, op_dist_attr):
 
         return True
 
-    def is_output_compatible(self, op_dist_attr):
-        op_desc = op_dist_attr.get_owner_op().desc
+    def is_output_compatible(self, dist_op):
+        op_desc = dist_op.serial_op.desc
+        op_dist_attr = dist_op.dist_attr
+        op_desc = dist_op.serial_op.desc
+        op_dist_attr = dist_op.dist_attr
         out_name = op_desc.output('Out')[0]
         out_dims_mapping = op_dist_attr.get_output_dims_mapping(out_name)
 
@@ -978,9 +973,9 @@ def is_output_compatible(self, op_dist_attr):
 
         return True
 
-    def update_dims_mapping(self, op_dist_attr):
+    def update_dims_mapping(self, dist_op):
         changed = False
-        dim_changed = _update_dims_mapping_for_matmul(op_dist_attr)
+        dim_changed = _update_dims_mapping_for_matmul(dist_op)
         if dim_changed:
             changed = True
         return changed
diff --git a/python/paddle/distributed/auto_parallel/operators/dist_reshape.py b/python/paddle/distributed/auto_parallel/operators/dist_reshape.py
index 39e97850b8656..8821f3bc65782 100644
--- a/python/paddle/distributed/auto_parallel/operators/dist_reshape.py
+++ b/python/paddle/distributed/auto_parallel/operators/dist_reshape.py
@@ -12,9 +12,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License
 
-from .common import DistributedOperator
+from .common import DistributedOperatorImplContainer
 from .common import DistributedOperatorImpl
-from .common import register_distributed_operator
+from .common import register_distributed_operator_impl_container
 from .common import register_distributed_operator_impl
 from ..utils import is_dim_shard
 from ..utils import is_dim_replicate
@@ -28,13 +28,14 @@
 from paddle.fluid.data_feeder import check_variable_and_dtype, check_dtype
 
 
-class DistributedReshape2(DistributedOperator):
+class DistributedReshape2(DistributedOperatorImplContainer):
     def __init__(self, name):
         super(DistributedReshape2, self).__init__()
         self._name = name
 
 
-register_distributed_operator("reshape2", DistributedReshape2("reshape2"))
+register_distributed_operator_impl_container("reshape2",
+                                             DistributedReshape2("reshape2"))
 
 
 class DistributedReshapeImpl0(DistributedOperatorImpl):
@@ -44,12 +45,9 @@ def __init__(self, name):
         self._forward_implemented = True
         self._backward_implemented = True
 
-    def is_process_mesh_compatible(self, op_dist_attr):
-        """ No restriction for now. """
-        return True
-
-    def is_input_compatible(self, op_dist_attr):
-        op_desc = op_dist_attr.get_owner_op().desc
+    def is_input_compatible(self, dist_op):
+        op_desc = dist_op.serial_op.desc
+        op_dist_attr = dist_op.dist_attr
         x_name = op_desc.input('X')[0]
         out_name = op_desc.output('Out')[0]
         x_dims_mapping = op_dist_attr.get_input_dims_mapping(x_name)
@@ -60,8 +58,9 @@ def is_input_compatible(self, op_dist_attr):
 
         return True
 
-    def is_output_compatible(self, op_dist_attr):
-        op_desc = op_dist_attr.get_owner_op().desc
+    def is_output_compatible(self, dist_op):
+        op_desc = dist_op.serial_op.desc
+        op_dist_attr = dist_op.dist_attr
         x_name = op_desc.input('X')[0]
         out_name = op_desc.output('Out')[0]
         x_dims_mapping = op_dist_attr.get_input_dims_mapping(x_name)
@@ -75,9 +74,10 @@ def is_output_compatible(self, op_dist_attr):
 
         return True
 
-    def update_dims_mapping(self, op_dist_attr):
+    def update_dims_mapping(self, dist_op):
         changed = False
-        op_desc = op_dist_attr.get_owner_op().desc
+        op_desc = dist_op.serial_op.desc
+        op_dist_attr = dist_op.dist_attr
         x_name = op_desc.input('X')[0]
         out_name = op_desc.output('Out')[0]
         x_shape_name = op_desc.output('XShape')[0]
@@ -103,15 +103,15 @@ def forward(ctx, *args, **kwargs):
         kwargs: inputname_mapping & outputname_mapping
         """
 
-        dist_op_helper = ctx.get_dist_op_helper()
-        main_block = dist_op_helper.get_dst_main_program().global_block()
-        src_op = dist_op_helper.get_cur_src_op()
-        rank_id = dist_op_helper.get_rank_id()
-        op_dist_attr = ctx.get_op_distributed_attr_for_program(src_op)
+        dist_op_context = ctx.dist_op_context
+        main_block = dist_op_context.get_dst_main_program().global_block()
+        src_op = dist_op_context.get_cur_src_op()
+        rank_id = dist_op_context.get_rank_id()
+        op_dist_attr = ctx.get_op_dist_attr_for_program(src_op)
         assert op_dist_attr is not None, "backward op [{}] don't have dist attribute !".format(
             str(src_op))
 
-        # check validation of inputs / outputs 
+        # check validation of inputs / outputs
         for input_name in src_op.desc.input_names():
             assert input_name in kwargs, "input [{}] is not given".format(
                 input_name)
@@ -139,7 +139,7 @@ def forward(ctx, *args, **kwargs):
 
         # got dist attribute info
         dim_mapping = op_dist_attr.get_output_dims_mapping(Out_var.name)
-        process_mesh_shape = op_dist_attr.get_process_mesh().topology
+        process_mesh_shape = op_dist_attr.process_mesh.topology
 
         # modify target shape
         for idx, axis in enumerate(dim_mapping):
@@ -172,12 +172,9 @@ def __init__(self, name):
         self._forward_implemented = True
         self._backward_implemented = True
 
-    def is_process_mesh_compatible(self, op_dist_attr):
-        """ No restriction for now. """
-        return True
-
-    def is_input_compatible(self, op_dist_attr):
-        op_desc = op_dist_attr.get_owner_op().desc
+    def is_input_compatible(self, dist_op):
+        op_desc = dist_op.serial_op.desc
+        op_dist_attr = dist_op.dist_attr
         x_name = op_desc.input('X')[0]
         out_name = op_desc.output('Out')[0]
         x_dims_mapping = op_dist_attr.get_input_dims_mapping(x_name)
@@ -191,8 +188,9 @@ def is_input_compatible(self, op_dist_attr):
 
         return True
 
-    def is_output_compatible(self, op_dist_attr):
-        op_desc = op_dist_attr.get_owner_op().desc
+    def is_output_compatible(self, dist_op):
+        op_desc = dist_op.serial_op.desc
+        op_dist_attr = dist_op.dist_attr
         x_name = op_desc.input('X')[0]
         out_name = op_desc.output('Out')[0]
         x_dims_mapping = op_dist_attr.get_input_dims_mapping(x_name)
@@ -203,9 +201,10 @@ def is_output_compatible(self, op_dist_attr):
 
         return True
 
-    def update_dims_mapping(self, op_dist_attr):
+    def update_dims_mapping(self, dist_op):
         changed = False
-        op_desc = op_dist_attr.get_owner_op().desc
+        op_desc = dist_op.serial_op.desc
+        op_dist_attr = dist_op.dist_attr
         x_name = op_desc.input('X')[0]
         out_name = op_desc.output('Out')[0]
         x_shape_name = op_desc.output('XShape')[0]
@@ -231,15 +230,15 @@ def forward(ctx, *args, **kwargs):
         kwargs: inputname_mapping & outputname_mapping
         """
 
-        dist_op_helper = ctx.get_dist_op_helper()
-        main_block = dist_op_helper.get_dst_main_program().global_block()
-        src_op = dist_op_helper.get_cur_src_op()
-        rank_id = dist_op_helper.get_rank_id()
-        op_dist_attr = ctx.get_op_distributed_attr_for_program(src_op)
+        dist_op_context = ctx.dist_op_context
+        main_block = dist_op_context.get_dst_main_program().global_block()
+        src_op = dist_op_context.get_cur_src_op()
+        rank_id = dist_op_context.get_rank_id()
+        op_dist_attr = ctx.get_op_dist_attr_for_program(src_op)
         assert op_dist_attr is not None, "backward op [{}] don't have dist attribute !".format(
             str(src_op))
 
-        # check validation of inputs / outputs 
+        # check validation of inputs / outputs
         for input_name in src_op.desc.input_names():
             assert input_name in kwargs, "input [{}] is not given".format(
                 input_name)
@@ -267,7 +266,7 @@ def forward(ctx, *args, **kwargs):
 
         # got dist attribute info
         dim_mapping = op_dist_attr.get_output_dims_mapping(Out_var.name)
-        process_mesh_shape = op_dist_attr.get_process_mesh().topology
+        process_mesh_shape = op_dist_attr.process_mesh.topology
 
         # modify target shape
         for idx, axis in enumerate(dim_mapping):
diff --git a/python/paddle/distributed/auto_parallel/operators/dist_softmax.py b/python/paddle/distributed/auto_parallel/operators/dist_softmax.py
index 56be75b3beaf2..c90fc7da89d33 100644
--- a/python/paddle/distributed/auto_parallel/operators/dist_softmax.py
+++ b/python/paddle/distributed/auto_parallel/operators/dist_softmax.py
@@ -12,9 +12,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License
 
-from .common import DistributedOperator
+from .common import DistributedOperatorImplContainer
 from .common import DistributedOperatorImpl
-from .common import register_distributed_operator
+from .common import register_distributed_operator_impl_container
 from .common import register_distributed_operator_impl
 from ..utils import is_dim_shard
 from ..utils import is_dim_replicate
@@ -24,13 +24,14 @@
 from ..utils import compute_compatible_and_update_dim_mapping
 
 
-class DistributedSoftmax(DistributedOperator):
+class DistributedSoftmax(DistributedOperatorImplContainer):
     def __init__(self, name):
         super(DistributedSoftmax, self).__init__()
         self._name = name
 
 
-register_distributed_operator("softmax", DistributedSoftmax("softmax"))
+register_distributed_operator_impl_container("softmax",
+                                             DistributedSoftmax("softmax"))
 
 
 class DistributedSoftmaxImpl(DistributedOperatorImpl):
@@ -40,12 +41,9 @@ def __init__(self, name):
         self._forward_implemented = False
         self._backward_implemented = True
 
-    def is_process_mesh_compatible(self, op_dist_attr):
-        """ No restriction for now. """
-        return True
-
-    def is_input_compatible(self, op_dist_attr):
-        op_desc = op_dist_attr.get_owner_op().desc
+    def is_input_compatible(self, dist_op):
+        op_desc = dist_op.serial_op.desc
+        op_dist_attr = dist_op.dist_attr
         x_name = op_desc.input('X')[0]
         axis = op_desc.attr('axis')
         x_dims_mapping = op_dist_attr.get_input_dims_mapping(x_name)
@@ -58,8 +56,9 @@ def is_input_compatible(self, op_dist_attr):
 
         return True
 
-    def is_output_compatible(self, op_dist_attr):
-        op_desc = op_dist_attr.get_owner_op().desc
+    def is_output_compatible(self, dist_op):
+        op_desc = dist_op.serial_op.desc
+        op_dist_attr = dist_op.dist_attr
         out_name = op_desc.output('Out')[0]
         axis = op_desc.attr('axis')
         out_dims_mapping = op_dist_attr.get_output_dims_mapping(out_name)
@@ -72,9 +71,10 @@ def is_output_compatible(self, op_dist_attr):
 
         return True
 
-    def update_dims_mapping(self, op_dist_attr):
+    def update_dims_mapping(self, dist_op):
         changed = False
-        op_desc = op_dist_attr.get_owner_op().desc
+        op_desc = dist_op.serial_op.desc
+        op_dist_attr = dist_op.dist_attr
         x_name = op_desc.input('X')[0]
         out_name = op_desc.output('Out')[0]
         x_dims_mapping = op_dist_attr.get_input_dims_mapping(x_name)
diff --git a/python/paddle/distributed/auto_parallel/operators/dist_transpose.py b/python/paddle/distributed/auto_parallel/operators/dist_transpose.py
index 10b8bf2666f4b..0bfc7d9f4ca05 100644
--- a/python/paddle/distributed/auto_parallel/operators/dist_transpose.py
+++ b/python/paddle/distributed/auto_parallel/operators/dist_transpose.py
@@ -12,9 +12,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License
 
-from .common import DistributedOperator
+from .common import DistributedOperatorImplContainer
 from .common import DistributedOperatorImpl
-from .common import register_distributed_operator
+from .common import register_distributed_operator_impl_container
 from .common import register_distributed_operator_impl
 from ..utils import is_dim_shard
 from ..utils import is_dim_replicate
@@ -24,13 +24,14 @@
 from ..utils import compute_compatible_and_update_dim_mapping
 
 
-class DistributedTranspose2(DistributedOperator):
+class DistributedTranspose2(DistributedOperatorImplContainer):
     def __init__(self, name):
         super(DistributedTranspose2, self).__init__()
         self._name = name
 
 
-register_distributed_operator("transpose2", DistributedTranspose2("transpose2"))
+register_distributed_operator_impl_container(
+    "transpose2", DistributedTranspose2("transpose2"))
 
 
 class DistributedTranspose2Impl(DistributedOperatorImpl):
@@ -40,19 +41,16 @@ def __init__(self, name):
         self._forward_implemented = False
         self._backward_implemented = True
 
-    def is_process_mesh_compatible(self, op_dist_attr):
-        """ No restriction for now. """
+    def is_input_compatible(self, dist_op):
         return True
 
-    def is_input_compatible(self, op_dist_attr):
+    def is_output_compatible(self, dist_op):
         return True
 
-    def is_output_compatible(self, op_dist_attr):
-        return True
-
-    def update_dims_mapping(self, op_dist_attr):
+    def update_dims_mapping(self, dist_op):
         changed = False
-        op_desc = op_dist_attr.get_owner_op().desc
+        op_desc = dist_op.serial_op.desc
+        op_dist_attr = dist_op.dist_attr
         x_name = op_desc.input('X')[0]
         out_name = op_desc.output('Out')[0]
         x_shape_name = op_desc.output('XShape')[0]
diff --git a/python/paddle/distributed/auto_parallel/parallelizer.py b/python/paddle/distributed/auto_parallel/parallelizer.py
index 8f4a4866eb8db..7a0cbd7da31c5 100644
--- a/python/paddle/distributed/auto_parallel/parallelizer.py
+++ b/python/paddle/distributed/auto_parallel/parallelizer.py
@@ -15,11 +15,12 @@
 import paddle
 from paddle.distributed.fleet import cloud_utils
 import paddle.fluid.core as core
-from .context import DistributedContext
-from .context import get_default_distributed_context
+from .dist_context import DistributedContext
+from .dist_context import get_default_distributed_context
+from .dist_context import set_default_distributed_context
 from .completion import complete_annotation, complete_backward_annotation
 from .partitioner import Partitioner
-from .process import get_all_process_groups
+from .process_group import get_all_process_groups
 from .utils import make_data_unshard
 from .reshard import reshard
 
@@ -38,8 +39,7 @@ def __init__(self, fleet):
         self._fleet = fleet
         self._optimizer = self._fleet.user_defined_optimizer
         self._dist_strategy = self._fleet._user_defined_strategy
-        # self._dist_context = DistributedContext()
-        self._dist_context = get_default_distributed_context()
+        self._dist_context = DistributedContext()
 
     def _remove_distributed_attrs(self, main_program):
         suffix = core.kAutoParallelSuffix()
@@ -53,24 +53,15 @@ def _remove_distributed_attrs(self, main_program):
 
     def parallelize(self,
                     loss,
-                    startup_program=None,
+                    startup_program,
                     parameter_list=None,
                     no_grad_set=None):
-        self._original_main_program = loss.block.program
-        # For now, we only allow user to use the default startup and main program
         assert startup_program is not None
-        if startup_program == None:
-            self._original_startup_program = \
-                paddle.static.default_startup_program().clone(for_test=False)
-            startup_program = paddle.static.default_startup_program()
-        else:
-            self._original_startup_program = \
-                startup_program.clone(for_test=False)
+        main_program = loss.block.program
 
         # Annotation completion
-        completed_main_program = complete_annotation(
-            self._original_main_program, self._dist_context)
-
+        completed_main_program = complete_annotation(main_program,
+                                                     self._dist_context)
         # Logical partition 
         rank = paddle.distributed.get_rank()
         partitioner = Partitioner(self._dist_strategy, self._dist_context, rank)
@@ -94,9 +85,13 @@ def parallelize(self,
         # The last step: remove all distributed attributes to be compatiable
         # with inference.
         self._remove_distributed_attrs(partitioned_main_prog)
-        make_data_unshard(partitioned_main_prog, partitioned_startup_prog)
+        make_data_unshard(partitioned_main_prog, partitioned_startup_prog,
+                          self._dist_context)
 
         reshard(partitioned_main_prog, partitioned_startup_prog, rank,
                 self._dist_context)
 
+        # Copy distributed info to the default context
+        set_default_distributed_context(self._dist_context)
+
         return dist_optimize_ops, dist_params_grads, partitioned_startup_prog, partitioned_main_prog
diff --git a/python/paddle/distributed/auto_parallel/partitioner.py b/python/paddle/distributed/auto_parallel/partitioner.py
index c0a91f4b53a0d..9af194e810fb6 100755
--- a/python/paddle/distributed/auto_parallel/partitioner.py
+++ b/python/paddle/distributed/auto_parallel/partitioner.py
@@ -22,15 +22,15 @@
 from paddle.fluid.framework import Program, Parameter, Variable, program_guard
 from paddle.fluid.data_feeder import check_variable_and_dtype, check_dtype
 from paddle.fluid.backward import append_backward, _some_in_set_, _append_grad_suffix_
-from paddle.distributed.auto_parallel.operators.common import get_distributed_operator
+from paddle.distributed.auto_parallel.operators.common import get_distributed_operator_impl_container
 from paddle.fluid.clip import GradientClipBase, GradientClipByNorm, error_clip_callback, append_gradient_clip_ops, ClipGradByGlobalNorm
 from paddle.distributed.fleet.base.distributed_strategy import DistributedStrategy
-from paddle.distributed.auto_parallel.context import DistributedContext, DistOpHelper
+from paddle.distributed.auto_parallel.dist_context import DistributedContext, DistributedOperatorContext
 from paddle.distributed.fleet.meta_optimizers.common import is_loss_grad_op, is_backward_op, is_optimizer_op
 from paddle.distributed.fleet.meta_optimizers.common import OpRole, OP_ROLE_KEY, OP_ROLE_VAR_KEY
-from .process import new_process_group
-from .interface import _g_process_mesh_map
-from .attribute import OperatorDistributedAttribute
+from .dist_attribute import OperatorDistributedAttribute
+from .process_group import new_process_group
+from .utils import print_program_with_dist_attr
 from paddle.distributed.auto_parallel.completion import complete_backward_annotation, complete_update_annotation
 
 __varname_not_in_block__ = ["lod_tensor_blocking_queue_0"]
@@ -68,14 +68,14 @@ class Partitioner(object):
             # auto completion
             auto.ProcessMesh(shape=[2, 4], process_group=[0, 1, 2, 3, 4, 5, 6, 7])
             annotated_main_program = auto.complete_annotation(serial_main_program)
-            auto_paralle_context = get_default_distributed_context()
+            dist_context = get_default_distributed_context()
                 
             # distributed strategy & rank info
             rank_id = paddle.distributed.get_rank()
             dist_strategy = fleet.DistributedStrategy()
     
             # create partitioner
-            Partitioner = Partitioner(dist_strategy, auto_paralle_context, rank_id)
+            Partitioner = Partitioner(dist_strategy, dist_context, rank_id)
 
             # create dist program with forward only
             # for distributed inference, using partitioned_main_prog from here
@@ -93,11 +93,11 @@ class Partitioner(object):
             opt_ops = Partitioner.apply_optimize(optimizer, dist_params_grads, partitioned_main_prog, partitioned_startup_prog)
     """
 
-    def __init__(self, dist_strategy, auto_parallel_context, rank_id=0):
+    def __init__(self, dist_strategy, dist_context, rank_id=0):
         """
         Args:
             dist_strategy (paddle.fleet.distributed_strategy): used to determine the user defined distributed strategy.
-            auto_parallel_context (paddle.fluid.DistributedContext): used to access the distributed_attr of var & op, every Partitioner object could maintain its own DistributedContext member, and partition program base on that shard scenario.
+            dist_context (paddle.fluid.DistributedContext): used to access the distributed_attr of var & op, every Partitioner object could maintain its own DistributedContext member, and partition program base on that shard scenario.
             rank_id (int): global rank id to which the partitioned distributed program belong.
         """
 
@@ -106,13 +106,13 @@ def __init__(self, dist_strategy, auto_parallel_context, rank_id=0):
                 "dist_strategy be paddle.fleet.base.DistributedStrategy, got %s here"
                 % type(dist_strategy))
 
-        if not isinstance(auto_parallel_context, DistributedContext):
+        if not isinstance(dist_context, DistributedContext):
             raise TypeError(
-                "auto_parallel_context be paddle.fluid.DistributedContext, got %s here"
-                % type(auto_parallel_context))
+                "dist_context be paddle.fluid.DistributedContext, got %s here" %
+                type(dist_context))
 
         self._dist_strategy = dist_strategy
-        self._auto_parallel_context = auto_parallel_context
+        self._dist_context = dist_context
         self._rank_id = rank_id
         self._serial2dist_varname_mapping = {}
         self._dist_varname_suffix = ""
@@ -218,8 +218,8 @@ def transpile_forward_impl(self, main_program, startup_program):
 
         if not isinstance(startup_program, (Program)):
             raise TypeError(
-                "auto_parallel_context be paddle.fluid.framework.program, got %s here"
-                % type(startup_program))
+                "dist_context be paddle.fluid.framework.program, got %s here" %
+                type(startup_program))
 
         # check if shard annotated serial program valid
         if not self._is_valid_annotated_program(main_program):
@@ -310,13 +310,12 @@ def _dist_var_op_forward_transpile(self,
                 if isinstance(var, Parameter):
                     # TODO if var not belong to this rank, should be filtered
                     serial_main_var = serial_main_block.var(var.name)
-                    dist_attr = self._auto_parallel_context.get_tensor_distributed_attr_for_program(
+                    dist_attr = self._dist_context.get_tensor_dist_attr_for_program(
                         serial_main_var)
                     target_shape = _get_dist_shape(serial_main_var, dist_attr)
                     new_name = var.name + self._dist_varname_suffix
                     temp_varname_map[var.name] = new_name
-                    _partition_parameter(self._auto_parallel_context,
-                                         serial_main_var,
+                    _partition_parameter(self._dist_context, serial_main_var,
                                          partitioned_startup_global_block,
                                          new_name, target_shape)
                     param2shape[new_name] = target_shape
@@ -346,24 +345,22 @@ def _dist_var_op_forward_transpile(self,
                 assert new_op.desc == new_op_desc
                 output_var = partitioned_startup_global_block.var(output_vars[
                     0])
-                output_var_attr = self._auto_parallel_context.get_tensor_distributed_attr_for_program(
+                output_var_attr = self._dist_context.get_tensor_dist_attr_for_program(
                     output_var)
-                op_attr = OperatorDistributedAttribute(
-                    new_op, self._auto_parallel_context)
-                op_attr.set_process_mesh(output_var_attr.get_process_mesh())
-                op_attr.set_output_dims_mapping(
-                    output_var.name, output_var_attr.get_dims_mapping())
-                op_attr.set_input_dims_mapping(
-                    output_var.name, output_var_attr.get_dims_mapping())
-                self._auto_parallel_context.set_op_distributed_attr_for_program(
-                    new_op, op_attr)
+                op_attr = OperatorDistributedAttribute()
+                op_attr.process_mesh = output_var_attr.process_mesh
+                op_attr.set_output_dims_mapping(output_var.name,
+                                                output_var_attr.dims_mapping)
+                op_attr.set_input_dims_mapping(output_var.name,
+                                               output_var_attr.dims_mapping)
+                self._dist_context.set_op_dist_attr_for_program(new_op, op_attr)
 
         # TODO move helper init to a comm place
-        dist_op_helper = self._auto_parallel_context.get_dist_op_helper()
-        dist_op_helper.set_dst_main_program(partitioned_main_prog)
-        dist_op_helper.set_dst_startup_program(partitioned_startup_prog)
-        dist_op_helper.set_varname_mapping(self._serial2dist_varname_mapping)
-        dist_op_helper.set_rank_id(self._rank_id)
+        dist_op_context = self._dist_context.dist_op_context
+        dist_op_context.set_dst_main_program(partitioned_main_prog)
+        dist_op_context.set_dst_startup_program(partitioned_startup_prog)
+        dist_op_context.set_varname_mapping(self._serial2dist_varname_mapping)
+        dist_op_context.set_rank_id(self._rank_id)
 
         # transpile main program
         for op in serial_ops:
@@ -373,8 +370,7 @@ def _dist_var_op_forward_transpile(self,
                 if serial_input_varname not in self._serial2dist_varname_mapping:
                     new_varname = serial_input_varname + self._dist_varname_suffix
                     if serial_main_block.has_var(serial_input_varname):
-                        _partition_var(self._auto_parallel_context,
-                                       serial_main_block,
+                        _partition_var(self._dist_context, serial_main_block,
                                        partitioned_global_block,
                                        serial_input_varname, new_varname)
                     else:
@@ -387,28 +383,25 @@ def _dist_var_op_forward_transpile(self,
             for serial_output_varname in op.desc.output_arg_names():
                 if serial_output_varname not in self._serial2dist_varname_mapping:
                     new_varname = serial_output_varname + self._dist_varname_suffix
-                    _partition_var(self._auto_parallel_context,
-                                   serial_main_block, partitioned_global_block,
+                    _partition_var(self._dist_context, serial_main_block,
+                                   partitioned_global_block,
                                    serial_output_varname, new_varname)
                     self._serial2dist_varname_mapping[
                         serial_output_varname] = new_varname
 
             # partition op
-            kinputs, koutputs = dist_op_helper.prepare_forward_context(op)
-            dist_attr = self._auto_parallel_context.get_op_distributed_attr_for_program(
-                op)
-            if _is_dist_op_forward_implement(self._auto_parallel_context, op):
-                dist_ops = get_distributed_operator(op.type)
-                dist_op_impl = dist_ops.get_impl(dist_attr.get_impl_idx())
-                dist_op_impl.forward(self._auto_parallel_context, **kinputs,
-                                     **koutputs)
+            kinputs, koutputs = dist_op_context.prepare_forward_context(op)
+            dist_attr = self._dist_context.get_op_dist_attr_for_program(op)
+            if _is_dist_op_forward_implement(self._dist_context, op):
+                dist_ops = get_distributed_operator_impl_container(op.type)
+                dist_op_impl = dist_ops.get_impl(dist_attr.impl_idx)
+                dist_op_impl.forward(self._dist_context, **kinputs, **koutputs)
 
             else:
                 # replicate op
-                dist_ops = get_distributed_operator("default")
+                dist_ops = get_distributed_operator_impl_container("default")
                 dist_op_impl = dist_ops.get_impl(0)
-                dist_op_impl.forward(self._auto_parallel_context, **kinputs,
-                                     **koutputs)
+                dist_op_impl.forward(self._dist_context, **kinputs, **koutputs)
 
         return partitioned_main_prog, partitioned_startup_prog
 
@@ -453,18 +446,18 @@ def _dist_var_op_backward_transpile(self,
                     for param in no_grad_set
                 ]
 
-            dist_op_helper = self._auto_parallel_context.get_dist_op_helper()
+            dist_op_context = self._dist_context.dist_op_context
             params_and_grads = _auto_backward(
                 dist_loss,
                 dist_startup_program,
                 parameter_list=parameter_list,
                 no_grad_set=no_grad_set,
                 callbacks=callbacks,
-                distop_context=dist_op_helper)
+                distop_context=dist_op_context)
 
             # backward completion 
             complete_backward_annotation(
-                dist_main_program, dist_context=self._auto_parallel_context)
+                dist_main_program, dist_context=self._dist_context)
 
             # transpiler backward for dist op
             # get backward ops
@@ -485,31 +478,33 @@ def _dist_var_op_backward_transpile(self,
             backward_ops = ops[first_backward_op_idx:]
             for backward_op in backward_ops:
                 # if the backward op has a corresponding forward op
-                if backward_op.desc.id() in dist_op_helper.gradopidx2opidx:
-                    forward_op_id = dist_op_helper.gradopidx2opidx[
+                if backward_op.desc.id() in dist_op_context.gradopidx2opidx:
+                    forward_op_id = dist_op_context.gradopidx2opidx[
                         backward_op.desc.id()]
                     forward_op = forward_op_id2forward_op[forward_op_id]
                     # TODO backward attr should has _impl_idx
-                    forward_op_dist_attr = self._auto_parallel_context.get_op_distributed_attr_for_program(
+                    forward_op_dist_attr = self._dist_context.get_op_dist_attr_for_program(
                         forward_op)
                     # TODO use the backward op itself to find the dist op
-                    dist_ops = get_distributed_operator(forward_op.type)
-                    kinputs, koutputs = dist_op_helper.prepare_backward_context(
+                    dist_ops = get_distributed_operator_impl_container(
+                        forward_op.type)
+                    kinputs, koutputs = dist_op_context.prepare_backward_context(
                         backward_op)
 
                     # TODO use backward op itself to determine impl idx
-                    if _is_dist_op_backward_implement(
-                            self._auto_parallel_context, forward_op):
+                    if _is_dist_op_backward_implement(self._dist_context,
+                                                      forward_op):
                         dist_op_impl = dist_ops.get_impl(
-                            forward_op_dist_attr.get_impl_idx())
-                        dist_op_impl.backward(self._auto_parallel_context,
-                                              **kinputs, **koutputs)
+                            forward_op_dist_attr.impl_idx)
+                        dist_op_impl.backward(self._dist_context, **kinputs,
+                                              **koutputs)
                     else:
                         # replicate op
-                        dist_ops = get_distributed_operator("default")
+                        dist_ops = get_distributed_operator_impl_container(
+                            "default")
                         dist_op_impl = dist_ops.get_impl(0)
-                        dist_op_impl.backward(self._auto_parallel_context,
-                                              **kinputs, **koutputs)
+                        dist_op_impl.backward(self._dist_context, **kinputs,
+                                              **koutputs)
 
             return params_and_grads
         # replace dist grad ops
@@ -524,7 +519,7 @@ def _optimize_transpile(self, user_define_optimizer, params_grads,
 
         # update completion 
         complete_update_annotation(
-            main_program, dist_context=self._auto_parallel_context)
+            main_program, dist_context=self._dist_context)
 
         return optimize_ops
 
@@ -534,12 +529,11 @@ def _is_valid_annotated_program(self, program):
         ops = program.global_block().ops
         vars_ = program.list_vars()
         op_dist_attrs = [
-            self._auto_parallel_context.get_op_distributed_attr_for_program(op)
-            for op in ops
+            self._dist_context.get_op_dist_attr_for_program(op) for op in ops
         ]
         var_dist_attrs = [
-            self._auto_parallel_context.get_tensor_distributed_attr_for_program(
-                var) for var in vars_
+            self._dist_context.get_tensor_dist_attr_for_program(var)
+            for var in vars_
         ]
 
         all_ops_annotated = all(dist_attr is not None
@@ -563,8 +557,7 @@ def _serial_varname2dist_var(self, serial_varname, dist_program):
 
     def _is_var_distributed(self, var):
 
-        dist_attr = self._auto_parallel_context.get_tensor_distributed_attr_for_program(
-            var)
+        dist_attr = self._dist_context.get_tensor_dist_attr_for_program(var)
         assert dist_attr is not None, "dist_attr of var [{}] is None".format(
             var.name)
         return _is_distributed(dist_attr)
@@ -637,20 +630,20 @@ def _get_no_grad_set(loss, no_grad_set=None):
     return no_grad_set
 
 
-def _is_dist_op_forward_implement(auto_paralle_context, op):
-    dist_attr = auto_paralle_context.get_op_distributed_attr_for_program(op)
-    dist_ops = get_distributed_operator(op.type)
+def _is_dist_op_forward_implement(dist_context, op):
+    dist_attr = dist_context.get_op_dist_attr_for_program(op)
+    dist_ops = get_distributed_operator_impl_container(op.type)
 
-    return dist_ops and dist_attr.get_impl_idx() >= 0 and dist_ops.get_impl( \
-        dist_attr.get_impl_idx())._forward_implemented
+    return dist_ops and dist_attr.impl_idx >= 0 and dist_ops.get_impl( \
+        dist_attr.impl_idx)._forward_implemented
 
 
-def _is_dist_op_backward_implement(auto_paralle_context, op):
-    dist_attr = auto_paralle_context.get_op_distributed_attr_for_program(op)
-    dist_ops = get_distributed_operator(op.type)
+def _is_dist_op_backward_implement(dist_context, op):
+    dist_attr = dist_context.get_op_dist_attr_for_program(op)
+    dist_ops = get_distributed_operator_impl_container(op.type)
 
-    return dist_ops and dist_attr.get_impl_idx() >= 0 and dist_ops.get_impl( \
-        dist_attr.get_impl_idx())._backward_implemented
+    return dist_ops and dist_attr.impl_idx >= 0 and dist_ops.get_impl( \
+        dist_attr.impl_idx)._backward_implemented
 
 
 def _auto_backward(loss,
@@ -690,8 +683,8 @@ def _auto_backward(loss,
 
 def _is_distributed(dist_attr):
 
-    mapping = dist_attr.get_dims_mapping()
-    mesh = dist_attr.get_process_mesh().topology
+    mapping = dist_attr.dims_mapping
+    mesh = dist_attr.process_mesh.topology
     for idx in range(len(mapping)):
         if mapping[idx] >= 0 and mesh[mapping[idx]] > 1:
             return True
@@ -702,8 +695,8 @@ def _is_distributed(dist_attr):
 def _get_dist_shape(var, dist_attr):
 
     var_shape = var.shape
-    mapping = dist_attr.get_dims_mapping()
-    mesh = dist_attr.get_process_mesh().topology
+    mapping = dist_attr.dims_mapping
+    mesh = dist_attr.process_mesh.topology
     assert len(var_shape) == len(
         mapping
     ), "variable shape [{}] and dim_mapping [{}] is NOT match !".format(
@@ -721,7 +714,7 @@ def _get_dist_shape(var, dist_attr):
     return new_shape
 
 
-def _partition_parameter(auto_paralle_context, src_var, dst_block, dst_varname,
+def _partition_parameter(dist_context, src_var, dst_block, dst_varname,
                          dst_shape):
     # NOTE hack to copied Parameter
     # not initialized parameter, need to initialize it 
@@ -749,17 +742,13 @@ def _partition_parameter(auto_paralle_context, src_var, dst_block, dst_varname,
     # distributed_attr_uid = src_var.desc.get_distributed_attr_uid()
     # param.desc.set_distributed_attr_uid(distributed_attr_uid)
     dist_attr = copy.deepcopy(
-        auto_paralle_context.get_tensor_distributed_attr_for_program(src_var))
+        dist_context.get_tensor_dist_attr_for_program(src_var))
     assert dist_attr is not None
-    dist_attr._owner_tensor = param
-    dist_attr._owner_context = auto_paralle_context.get_tensor_distributed_attr_for_program(
-        src_var)._owner_context
-    auto_paralle_context.set_tensor_distributed_attr_for_program(param,
-                                                                 dist_attr)
+    dist_context.set_tensor_dist_attr_for_program(param, dist_attr)
 
 
-def _partition_intermediate_var(auto_paralle_context, src_var, dst_block,
-                                dst_varname, dst_shape):
+def _partition_intermediate_var(dist_context, src_var, dst_block, dst_varname,
+                                dst_shape):
     var = dst_block.create_var(
         type=src_var.type,
         name=dst_varname,
@@ -776,15 +765,12 @@ def _partition_intermediate_var(auto_paralle_context, src_var, dst_block,
     # distributed_attr_uid = src_var.desc.get_distributed_attr_uid()
     # var.desc.set_distributed_attr_uid(distributed_attr_uid)
     dist_attr = copy.deepcopy(
-        auto_paralle_context.get_tensor_distributed_attr_for_program(src_var))
+        dist_context.get_tensor_dist_attr_for_program(src_var))
     assert dist_attr is not None
-    dist_attr._owner_tensor = var
-    dist_attr._owner_context = auto_paralle_context.get_tensor_distributed_attr_for_program(
-        src_var)._owner_context
-    auto_paralle_context.set_tensor_distributed_attr_for_program(var, dist_attr)
+    dist_context.set_tensor_dist_attr_for_program(var, dist_attr)
 
 
-def _partition_var(auto_paralle_context, src_block, dst_block, src_varname,
+def _partition_var(dist_context, src_block, dst_block, src_varname,
                    dst_varname):
     """
     partition include: split + replicate
@@ -798,16 +784,15 @@ def _partition_var(auto_paralle_context, src_block, dst_block, src_varname,
             persistable=True,
             stop_gradient=True)
     else:
-        dist_attr = auto_paralle_context.get_tensor_distributed_attr_for_program(
-            src_var)
+        dist_attr = dist_context.get_tensor_dist_attr_for_program(src_var)
         target_shape = _get_dist_shape(src_var, dist_attr)
 
         if isinstance(src_var, Parameter):
-            _partition_parameter(auto_paralle_context, src_var, dst_block,
-                                 dst_varname, target_shape)
+            _partition_parameter(dist_context, src_var, dst_block, dst_varname,
+                                 target_shape)
         else:
-            _partition_intermediate_var(auto_paralle_context, src_var,
-                                        dst_block, dst_varname, target_shape)
+            _partition_intermediate_var(dist_context, src_var, dst_block,
+                                        dst_varname, target_shape)
 
 
 def _insert_src_op(src_op, dst_block, varname_mapping):
@@ -822,8 +807,7 @@ def _insert_src_op(src_op, dst_block, varname_mapping):
     dst_block._sync_with_cpp()
 
 
-def _insert_dist_op(src_op, dst_block, varname_mapping, auto_paralle_context,
-                    rank_id):
+def _insert_dist_op(src_op, dst_block, varname_mapping, dist_context, rank_id):
 
     # build input varname mapping
     input_mapping = {}
@@ -842,10 +826,9 @@ def _insert_dist_op(src_op, dst_block, varname_mapping, auto_paralle_context,
         output_mapping[output_name] = varnames
 
     # append dist op 
-    dist_attr = auto_paralle_context.get_op_distributed_attr_for_program(src_op)
-    dist_ops = get_distributed_operator(src_op.type)
-    append_op_handle = dist_ops.get_impl(dist_attr.get_impl_idx()).forward(
-        src_op)
+    dist_attr = dist_context.get_op_dist_attr_for_program(src_op)
+    dist_ops = get_distributed_operator_impl_container(src_op.type)
+    append_op_handle = dist_ops.get_impl(dist_attr.impl_idx).forward(src_op)
     append_op_handle(
         dst_block,
         src_op,
diff --git a/python/paddle/distributed/auto_parallel/process.py b/python/paddle/distributed/auto_parallel/process_group.py
similarity index 76%
rename from python/paddle/distributed/auto_parallel/process.py
rename to python/paddle/distributed/auto_parallel/process_group.py
index b919645b96ccc..8bbe6f69155a4 100644
--- a/python/paddle/distributed/auto_parallel/process.py
+++ b/python/paddle/distributed/auto_parallel/process_group.py
@@ -19,62 +19,32 @@
 from ...fluid.framework import in_dygraph_mode
 from ...fluid.layers.tensor import fill_constant
 
-LOGICAL_PROCESS_TO_PHYSICAL_PROCESS_MAP = None
-PROCESSOR_TO_PHYSICAL_PROCESS_MAP = None
-
-
-def get_all_logical_process_set():
-    from .interface import _g_process_mesh_map
-    all_logical_process_set = set(_g_process_mesh_map[0].process_group)
-    return all_logical_process_set
-
-
-def get_logical_process_to_physical_process_map():
-    global LOGICAL_PROCESS_TO_PHYSICAL_PROCESS_MAP
-    return LOGICAL_PROCESS_TO_PHYSICAL_PROCESS_MAP
-
-
-def set_logical_process_to_physical_process_map(mapping):
-    global LOGICAL_PROCESS_TO_PHYSICAL_PROCESS_MAP
-    LOGICAL_PROCESS_TO_PHYSICAL_PROCESS_MAP = mapping
-
-
-def get_processor_to_physical_process_map():
-    global PROCESSOR_TO_PHYSICAL_PROCESS_MAP
-    return PROCESSOR_TO_PHYSICAL_PROCESS_MAP
-
-
-def set_processor_to_physical_process_map(mapping):
-    global PROCESSOR_TO_PHYSICAL_PROCESS_MAP
-    PROCESSOR_TO_PHYSICAL_PROCESS_MAP = mapping
-
-
-PROCESS_GROUP_MAP = {}
+_g_process_group_map = {}
 
 
 def get_all_process_groups():
-    global PROCESS_GROUP_MAP
-    return PROCESS_GROUP_MAP.values()
+    global _g_process_group_map
+    return _g_process_group_map.values()
 
 
 def new_process_group(ranks):
-    global PROCESS_GROUP_MAP
-    if not PROCESS_GROUP_MAP:
+    global _g_process_group_map
+    if not _g_process_group_map:
         genv = _get_global_env()
-        PROCESS_GROUP_MAP["global_group"] = ProcessGroup(
+        _g_process_group_map["global_group"] = ProcessGroup(
             0, list(range(genv.world_size)))
     # A key constructed from ranks is used in the global process group map
     key = ''.join(map(str, sorted(ranks)))
-    if key not in PROCESS_GROUP_MAP:
-        num_groups = len(PROCESS_GROUP_MAP)
+    if key not in _g_process_group_map:
+        num_groups = len(_g_process_group_map)
         # Note: our process group may interfere with the original implementation
         # so the created group id should start from the original _new_ring_id()
         group_id = _new_ring_id() + num_groups + 1
         pg = ProcessGroup(group_id, ranks)
-        PROCESS_GROUP_MAP[key] = pg
+        _g_process_group_map[key] = pg
         return pg
     else:
-        pg = PROCESS_GROUP_MAP[key]
+        pg = _g_process_group_map[key]
         return pg
 
 
diff --git a/python/paddle/distributed/auto_parallel/process_mesh.py b/python/paddle/distributed/auto_parallel/process_mesh.py
new file mode 100644
index 0000000000000..ecdd77f7ea754
--- /dev/null
+++ b/python/paddle/distributed/auto_parallel/process_mesh.py
@@ -0,0 +1,135 @@
+#   Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy
+import copy
+
+
+def _get_nested_list_shape(nested_list):
+    """
+    Get the shape of a nested_list.
+    """
+    result = []
+    while isinstance(nested_list, list):
+        result.append(len(nested_list))
+        nested_list = nested_list[0]
+    return result
+
+
+def _flatten_nested_list(nested_list):
+    """
+    Get a list of all items in a nested_list.
+    Ref: https://stackoverflow.com/questions/952914/how-to-make-a-flat-list-out-of-a-list-of-lists
+    """
+    result = numpy.array(nested_list).flatten().tolist()
+    return result
+
+
+class ProcessMesh(object):
+    r"""
+    The class `Processmesh` describes the topology of logical processes. 
+    A mesh is an N-dimensional array. The shape of the N-dimensional
+    array represents the topology of logical processes and every
+    element of the N-dimensional array represent a logical process. For
+    example, the 2-dimensional array [[2, 4, 5], [0, 1, 3]]
+    illustrates six logical processes organized as the topology [2, 3],
+    i.e., the shape of the 2-dimensional array. With the above topology,
+    there are two parallel groups, where the first parallel group has a
+    parallel degree of 2 and the second one has a parallel degree of 3.
+    And the first logical process is the one with id=2.
+
+    Args:
+        mesh (list): an N-dimensional array (nested list) describes the toplogy
+            of logical processes. The shape of the N-dimensional array
+            represents the topology of logical processes and every 
+            element of the N-dimensional array represents a logical process.
+    
+    Returns:
+        None
+
+    Raises:
+        ValueError: If `mesh` is not an instance of list.
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            import paddle.distributed as dist
+            
+            paddle.enable_static()
+            
+            mesh = dist.ProcessMesh([[2, 4, 5], [0, 1, 3]])
+            assert mesh.topology == [2, 3]
+            assert mesh.processes == [2, 4, 5, 0, 1, 3]
+
+    """
+
+    def __init__(self, mesh):
+        if mesh is None or not isinstance(mesh, list):
+            raise ValueError('mesh must be an instance of list.')
+
+        processes = _flatten_nested_list(mesh)
+
+        assert all(isinstance(p, int) for p in processes), \
+            ("All elements of mesh must be integer")
+
+        assert min(processes) >= 0, ('All elements of mesh must be >= 0.')
+
+        unique_processes = set(processes)
+        assert len(unique_processes) == len(processes), (
+            'All elements of mesh must be unique.')
+
+        self._topology = _get_nested_list_shape(mesh)
+        self._processes = processes
+
+        from .dist_context import get_default_distributed_context
+        default_dist_cxt = get_default_distributed_context()
+        default_dist_cxt.add_process_mesh(self)
+
+    @property
+    def topology(self):
+        r"""
+        Get the topology of logical processes belonging to this ProcessMesh.
+        This is the shape of `mesh` used to initialized this ProcessMesh.
+        """
+        return self._topology
+
+    @property
+    def processes(self):
+        r"""
+        Get a list of all processes belonging to this ProcessMesh.
+        """
+        return self._processes
+
+    @property
+    def ndim(self):
+        r"""
+        Get the number of dimension of ProcessMesh.
+        """
+        return len(self._topology)
+
+    def __eq__(self, other):
+        if not isinstance(other, ProcessMesh):
+            return False
+        if self.topology != other.topology or self.processes != other.processes:
+            return False
+        return True
+
+    def __ne__(self, other):
+        return not self.__eq__(other)
+
+    def __str__(self):
+        str = "shape {} and process group {}".format(self.topology,
+                                                     self.processes)
+        return str
diff --git a/python/paddle/distributed/auto_parallel/reshard.py b/python/paddle/distributed/auto_parallel/reshard.py
index 2d54bf8a7887a..fb130e9deefe8 100644
--- a/python/paddle/distributed/auto_parallel/reshard.py
+++ b/python/paddle/distributed/auto_parallel/reshard.py
@@ -22,9 +22,9 @@
 from paddle.fluid.framework import Program, OpProtoHolder
 import paddle.fluid.layers.utils as utils
 from ..collective import _get_global_env
-from .context import DistributedContext
-from .attribute import OperatorDistributedAttribute, TensorDistributedAttribute
-from .process import new_process_group, ProcessGroup, PROCESS_GROUP_MAP
+from .dist_context import DistributedContext
+from .dist_attribute import OperatorDistributedAttribute, TensorDistributedAttribute
+from .process_group import new_process_group, ProcessGroup, _g_process_group_map
 
 
 class AllGatherOpDesc:
@@ -276,20 +276,22 @@ def _is_overlapped(shape_x, shape_y):
     return overlapped
 
 
-def _need_reshard(tensor_dist_attr, op_dist_attr):
+def _need_reshard(dist_tensor, dist_op):
     """Judge the tensor whether needs to be resharded."""
     is_reshard = False
-    tensor_dims_mapping = tensor_dist_attr.get_dims_mapping()
-    tensor_process_mesh = tensor_dist_attr.get_process_mesh()
-    op_input_dims_mapping = op_dist_attr.get_input_dims_mapping(
-        tensor_dist_attr.get_owner_tensor().name)
-    op_process_mesh = op_dist_attr.get_process_mesh()
+    tensor_dist_attr = dist_tensor.dist_attr
+    tensor_name = dist_tensor.serial_tensor.name
+    tensor_dims_mapping = tensor_dist_attr.dims_mapping
+    tensor_process_mesh = tensor_dist_attr.process_mesh
+    op_dist_attr = dist_op.dist_attr
+    op_input_dims_mapping = op_dist_attr.get_input_dims_mapping(tensor_name)
+    op_process_mesh = op_dist_attr.process_mesh
     if all(
             map(lambda x: x is not None, [
                 tensor_dims_mapping, tensor_process_mesh, op_input_dims_mapping,
                 op_process_mesh
             ])):
-        if tensor_dims_mapping != op_input_dims_mapping or tensor_process_mesh._id != op_process_mesh._id:
+        if tensor_dims_mapping != op_input_dims_mapping or tensor_process_mesh != op_process_mesh:
             is_reshard = True
     return is_reshard
 
@@ -305,28 +307,30 @@ def _compute_complete_shape(slice_shape, process_shape, dims_mapping):
     return complete_shape
 
 
-def find_op_desc_seq(source_tensor, tensor_dist_attr, op_dist_attr):
+def find_op_desc_seq(dist_tensor, dist_op):
     """
     Find the op description sequence to reshard the source tensor for matching the op requirement.
 
     Args:
-        source_tensor (Variable): A tensor with distributed attribute.
-        tensor_dist_attr (TensorDistributedAttribute): The distributed attribute of tensor.
-        op_dist_attr (OperatorDistributedAttribute): The distributed attribute of operator.
+        dist_tensor (DistributedTensor): A distributed tensor.
+        dist_op (DistributedOperator): A distributed operator.
 
     Returns:
         Dict, the dict represents the required op description sequence corresponding to process, The key of dict is
         process and value is a list containing op description.
     """
-    source_dims_mapping = tensor_dist_attr.get_dims_mapping()
-    source_process_mesh = tensor_dist_attr.get_process_mesh()
-    source_process_group = source_process_mesh.process_group
+    tensor_dist_attr = dist_tensor.dist_attr
+    source_tensor = dist_tensor.serial_tensor
+    tensor_name = source_tensor.name
+    source_dims_mapping = tensor_dist_attr.dims_mapping
+    source_process_mesh = tensor_dist_attr.process_mesh
+    source_process_group = source_process_mesh.processes
     source_process_shape = source_process_mesh.topology
 
-    target_process_mesh = op_dist_attr.get_process_mesh()
-    target_dims_mapping = op_dist_attr.get_input_dims_mapping(
-        tensor_dist_attr.get_owner_tensor().name)
-    target_process_group = target_process_mesh.process_group
+    op_dist_attr = dist_op.dist_attr
+    target_process_mesh = op_dist_attr.process_mesh
+    target_dims_mapping = op_dist_attr.get_input_dims_mapping(tensor_name)
+    target_process_group = target_process_mesh.processes
     target_process_shape = target_process_mesh.topology
 
     complete_shape = _compute_complete_shape(
@@ -662,11 +666,11 @@ def _concat_partitions_with_op(partition_tensor_list, tensor, partition_index,
 
 
 def _init_comm_for_send_recv():
-    if not PROCESS_GROUP_MAP:
+    if not _g_process_group_map:
         genv = _get_global_env()
-        PROCESS_GROUP_MAP["global_group"] = ProcessGroup(
+        _g_process_group_map["global_group"] = ProcessGroup(
             0, list(range(genv.world_size)))
-        PROCESS_GROUP_MAP["global_group"].instantiate()
+        _g_process_group_map["global_group"].instantiate()
 
 
 HAS_SENT = {}
@@ -773,31 +777,29 @@ def parse_op_desc(program, rank_id, op_desc_seq, var_name, reshard_op,
                 axes=op_desc.axes,
                 new_var_name=new_name)
 
-            tensor_attr = TensorDistributedAttribute(target_tensor,
-                                                     dist_context)
-            process_mesh = dist_context.get_op_distributed_attr_for_program(
-                matched_op).get_process_mesh()
-            dims_mapping = dist_context.get_op_distributed_attr_for_program(
+            tensor_attr = TensorDistributedAttribute()
+            process_mesh = dist_context.get_op_dist_attr_for_program(
+                matched_op).process_mesh
+            dims_mapping = dist_context.get_op_dist_attr_for_program(
                 matched_op).get_input_dims_mapping(var_name)
-            tensor_attr.set_dims_mapping(dims_mapping)
-            tensor_attr.set_process_mesh(process_mesh)
-            dist_context.set_tensor_distributed_attr_for_program(target_tensor,
-                                                                 tensor_attr)
+            tensor_attr.dims_mapping = dims_mapping
+            tensor_attr.process_mesh = process_mesh
+            dist_context.set_tensor_dist_attr_for_program(target_tensor,
+                                                          tensor_attr)
 
             # rename op input name according to new name
             for op in block.ops:
                 for name in op.input_arg_names:
-                    op_dist_attr = dist_context.get_op_distributed_attr_for_program(
-                        op)
+                    op_dist_attr = dist_context.get_op_dist_attr_for_program(op)
                     if name == var_name and op_dist_attr is not None:
-                        op_process_mesh = op_dist_attr.get_process_mesh()
+                        op_process_mesh = op_dist_attr.process_mesh
                         op_input_dims_mapping = op_dist_attr.get_input_dims_mapping(
                             var_name)
-                        if op_process_mesh._id == process_mesh._id and op_input_dims_mapping == dims_mapping:
+                        if op_process_mesh == process_mesh and op_input_dims_mapping == dims_mapping:
                             op.desc._rename_input(name, target_tensor.name)
                             op_dist_attr.set_input_dims_mapping(
                                 target_tensor.name, dims_mapping)
-                            op_dist_attr._dims_mapping.pop(name, None)
+                            op_dist_attr.set_input_dist_attr(name, None)
 
 
 def _remove_no_need_ops(auto_parallel_main_prog, dist_context, rank_id):
@@ -825,9 +827,9 @@ def _remove_no_need_ops(auto_parallel_main_prog, dist_context, rank_id):
         if op.type == "c_sync_comm_stream":
             need_save = []
             for var_name in op.input_arg_names:
-                process_mesh = dist_context.get_tensor_distributed_attr_for_program(
-                    vars[var_name]).get_process_mesh()
-                if rank_id in process_mesh.process_group:
+                process_mesh = dist_context.get_tensor_dist_attr_for_program(
+                    vars[var_name]).process_mesh
+                if rank_id in process_mesh.processes:
                     need_save.append(var_name)
             if not need_save:
                 remove_op_idx.append(idx)
@@ -839,10 +841,10 @@ def _remove_no_need_ops(auto_parallel_main_prog, dist_context, rank_id):
             continue
 
         # judge the other op whether should be removed.
-        op_dist_attr = dist_context.get_op_distributed_attr_for_program(op)
+        op_dist_attr = dist_context.get_op_dist_attr_for_program(op)
         if op_dist_attr is not None:
-            op_process_mesh = op_dist_attr.get_process_mesh()
-            if rank_id not in op_process_mesh.process_group and op.type not in not_remove_op_ref:
+            op_process_mesh = op_dist_attr.process_mesh
+            if rank_id not in op_process_mesh.processes and op.type not in not_remove_op_ref:
                 remove_op_idx.append(idx)
 
     for idx in remove_op_idx[::-1]:
@@ -974,20 +976,18 @@ def reshard(auto_parallel_main_prog, auto_parallel_startup_prog, rank_id,
     while idx < len(block.ops):
         pre_op_count = len(block.ops)
         op = block.ops[idx]
-        op_dist_attr = dist_context.get_op_distributed_attr_for_program(op)
-        if op_dist_attr is not None:
+        dist_op = dist_context.get_dist_op_for_program(op)
+        if dist_op is not None:
             idx_offset = 0
             for var_name in op.input_arg_names:
                 # skip lod_tensor_blocking_queue_0
                 if var_name == "lod_tensor_blocking_queue_0":
                     continue
                 var = block.vars[var_name]
-                tensor_dist_attr = dist_context.get_tensor_distributed_attr_for_program(
-                    var)
-                if tensor_dist_attr is not None and _need_reshard(
-                        tensor_dist_attr, op_dist_attr):
-                    reshard_op_desc = find_op_desc_seq(var, tensor_dist_attr,
-                                                       op_dist_attr)
+                dist_tensor = dist_context.get_dist_tensor_for_program(var)
+                if dist_tensor is not None and _need_reshard(dist_tensor,
+                                                             dist_op):
+                    reshard_op_desc = find_op_desc_seq(dist_tensor, dist_op)
                     parse_op_desc(auto_parallel_main_prog, rank_id,
                                   reshard_op_desc, var_name, op, dist_context)
                     cur_op_count = len(block.ops)
diff --git a/python/paddle/distributed/auto_parallel/utils.py b/python/paddle/distributed/auto_parallel/utils.py
index 813bd481d9286..4e2c739119c57 100755
--- a/python/paddle/distributed/auto_parallel/utils.py
+++ b/python/paddle/distributed/auto_parallel/utils.py
@@ -12,10 +12,16 @@
 # See the License for the specific language governing permissions and
 # limitations under the License
 
+import os
+import paddle
 import threading
-import paddle.fluid.core as core
 import numpy as np
-from .interface import _g_process_mesh_map
+import warnings
+import logging
+
+import paddle.fluid.core as core
+from paddle.fluid.io import is_parameter, is_belong_to_optimizer
+from paddle.framework.io import _to_LodTensor
 
 
 def is_valid_list_index(list, index):
@@ -119,34 +125,35 @@ def remove_distributed_attr_suffix(name):
 
 
 def check_distributed_attr_for_program(program, dist_context=None):
-    from .context import get_default_distributed_context
+    from .dist_context import get_default_distributed_context
     if dist_context is None:
         dist_context = get_default_distributed_context()
     assert dist_context.is_initialized_for_program(), \
         "Distributed attributes must be initialized before check."
     for block in program.blocks:
         for tensor in block.vars.values():
-            tensor_dist_attr = dist_context.get_tensor_distributed_attr_for_program(
+            dist_tensor = dist_context.get_dist_tensor_for_graph(tensor)
+            tensor_dist_attr = dist_context.get_tensor_dist_attr_for_program(
                 tensor)
-            if (tensor_dist_attr is not None) and (
-                    not tensor_dist_attr.is_valid()):
+            if (tensor_dist_attr is not None) and (not dist_tensor.is_valid()):
                 return False
         for op in block.ops:
-            op_dist_attr = dist_context.get_op_distributed_attr_for_program(op)
-            if (op_dist_attr is not None) and (not op_dist_attr.is_valid()):
+            dist_op = dist_context.get_dist_op_for_graph(tensor)
+            op_dist_attr = dist_context.get_op_dist_attr_for_program(op)
+            if (op_dist_attr is not None) and (not dist_op.is_valid()):
                 return False
     return True
 
 
-def print_program_with_distributed_attr(program, dist_context=None):
+def print_program_with_dist_attr(program, dist_context=None):
     """
     This function reuses the original program output ability with a distributed context.
     Using lock can avoid multiple threads change the default distributed context simultaneously.
     """
     lock = threading.Lock()
     lock.acquire()
-    from .context import get_default_distributed_context
-    from .context import set_default_distributed_context
+    from .dist_context import get_default_distributed_context
+    from .dist_context import set_default_distributed_context
     if dist_context is None:
         dist_context = get_default_distributed_context()
         print(program)
@@ -233,12 +240,12 @@ def _coordinate2linear_idx(mesh_shape, coordinate):
 
     """
     # NOTE the following function work based on a strong an assumption
-    # that the processes in mesh are 
+    # that the processes in mesh are
     #    1. starts from 0
-    #    2. continuous  
-    # it will be wrong if ths above condition doesnot meet, 
+    #    2. continuous
+    # it will be wrong if ths above condition doesnot meet,
     # e.g. process_mesh = { process_groups = [7, 8, 9,10, 12, 13, 14, 15], mesh = [2, 4]}
-    # if you want a more general mapping, you should use cartesian product 
+    # if you want a more general mapping, you should use cartesian product
 
     assert len(mesh_shape) == len(
         coordinate
@@ -301,31 +308,29 @@ def _linear_idx2coordinate(mesh_shape, linear_idx):
     return coordinate
 
 
-def _get_corresponding_rank(target_mesh, rank):
+def _get_corresponding_rank(dist_context, target_mesh, rank):
 
     # TODO(JZ-LIANG) a hack method to support varying mesh in Pipeline parallelism case.
     # we assume that all mesh are evenly divide from a parent mesh and should have same size.
     # to revise this in future.
 
     coordinate = None
-    for key, mesh in _g_process_mesh_map.items():
-        if key == 0:
-            continue
-        if rank in mesh.process_group and mesh.topology == target_mesh.topology:
+    for mesh in dist_context.process_meshes:
+        if rank in mesh.processes and mesh.topology == target_mesh.topology:
             coordinate = _linear_idx2coordinate(mesh.topology,
-                                                mesh.process_group.index(rank))
+                                                mesh.processes.index(rank))
             break
 
     assert coordinate is not None, "could NOT found rank [{}] in any registered mesh".format(
         rank)
-    return target_mesh.process_group[_coordinate2linear_idx(mesh.topology,
-                                                            coordinate)]
+    return target_mesh.processes[_coordinate2linear_idx(mesh.topology,
+                                                        coordinate)]
 
 
 def _get_unshard_dist_shape(var, dist_attr):
     var_shape = var.shape
-    mapping = dist_attr.get_dims_mapping()
-    mesh = dist_attr.get_process_mesh().topology
+    mapping = dist_attr.dims_mapping
+    mesh = dist_attr.process_mesh.topology
     assert len(var_shape) == len(
         mapping
     ), "variable shape [{}] and dim_mapping [{}] is NOT match !".format(
@@ -340,20 +345,155 @@ def _get_unshard_dist_shape(var, dist_attr):
     return new_shape
 
 
-def make_data_unshard(dist_main_prog, dist_startup_prog):
-    from .context import get_default_distributed_context
-    dist_context = get_default_distributed_context()
+def make_data_unshard(dist_main_prog, dist_startup_prog, dist_context=None):
+    from .dist_context import get_default_distributed_context
+    if dist_context is None:
+        dist_context = get_default_distributed_context()
 
     for var in dist_main_prog.list_vars():
         if var.is_data:
-            tensor_dist_attr = dist_context.get_tensor_distributed_attr_for_program(
+            tensor_dist_attr = dist_context.get_tensor_dist_attr_for_program(
                 var)
             inverse_shape = _get_unshard_dist_shape(var, tensor_dist_attr)
             var.desc.set_shape(inverse_shape)
-            dim_mapping = tensor_dist_attr.get_dims_mapping()
+            dim_mapping = tensor_dist_attr.dims_mapping
             dim_mapping = [-1] * len(dim_mapping)
-            tensor_dist_attr.set_dims_mapping(dim_mapping)
-            dist_context.set_tensor_distributed_attr_for_program(
-                var, tensor_dist_attr)
-            var._set_attr('dim_mapping' + core.kAutoParallelSuffix(),
-                          dim_mapping)
+            tensor_dist_attr.dims_mapping = dim_mapping
+            dist_context.set_tensor_dist_attr_for_program(var, tensor_dist_attr)
+
+
+def _check_addition_info(addition_info):
+    """
+    Validity check of additional information
+    """
+    if not addition_info:
+        return addition_info
+    elif not isinstance(addition_info, dict):
+        raise TypeError(
+            "The type of addition_info should be 'dict', but got {}".format(
+                str(type(addition_info))))
+    else:
+        return addition_info
+
+
+def _check_valid_path(file_path):
+    """
+    Validity check of input file path
+    """
+    if not file_path:
+        return file_path
+    elif isinstance(file_path, str):
+        if not os.path.exists(file_path):
+            raise ValueError("The file_path '{}' does not exist.".format(
+                file_path))
+        else:
+            return [file_path]
+    elif isinstance(file_path, list):
+        if not all(isinstance(file, str) for file in file_path):
+            raise ValueError("The type of each file_path should be str.")
+        if not all(os.path.exists(file) for file in file_path):
+            raise ValueError("The file_path's file does not exist.")
+        return file_path
+    else:
+        raise TypeError(
+            "The type of file_path should be 'str' or 'list', but got '{}'.".
+            format(str(type(file_path))))
+
+
+def save_distributed_checkpoint(program,
+                                checkpoint_path,
+                                is_integrated=False,
+                                addition_info=None,
+                                dist_attr_path=None):
+    """ 
+    Save model parameter state, optimzer state, distributed attribute and 
+    additional information of each rank.
+
+    Args:
+        program(Program): The program to be saved.
+        checkpoint_path(str): The path of the checkpoint file to be saved.
+        is_integrated(bool, optional): Whether to integrate param before save. Default: False.
+        addition_info(dict, optional): Additional information. Default: None.
+        dist_attr_path(str, optional): The path of distributed attribute file to be saved. Default: None
+
+    Returns:
+        None
+
+    Examples:
+        .. code-block:: python
+
+            ckpt_path = os.path.join(args.output_dir, "step_%d" % step)
+            os.makedirs(ckpt_path, exist_ok=True)
+            save_distributed_checkpoint(program, ckpt_path)
+    """
+    if not is_integrated:
+        rank = paddle.distributed.get_rank()
+        ckpt_file_name = os.path.join(checkpoint_path,
+                                      "model_state_rank{}.pdmodel".format(rank))
+
+        state_dict = {
+            "model": program.state_dict(),
+            "ranks": paddle.distributed.get_world_size()
+        }
+        if _check_addition_info(addition_info):
+            state_dict["addition_info"] = addition_info
+
+        paddle.save(state_dict, ckpt_file_name)
+        logging.info("Already save model to {}".format(checkpoint_path))
+
+        if dist_attr_path:
+            raise NotImplementedError(
+                "Save distributed attribute has not been implemented.")
+    else:
+        # TODO: integrate param before save
+        raise NotImplementedError(
+            "Integrating parameter has not been implemented.")
+
+
+def load_distributed_checkpoint(checkpoint_path,
+                                program=None,
+                                dist_attr_path=None):
+    """ 
+    Load parameter, optimizer, distributed attribute and addition_info of model.
+
+    Args:
+        checkpoint_path(str|list[str]): checkpoint_path's type can be 'str' or 'list', \
+            which must be in order of rank id when type is 'list'.
+        program(Program, optional): The program to be updated with checkpoint_path. Default: None.
+        dist_attr_path(str|list[str], optional): dist_attr_path's type can be 'str' or 'list', \
+            which must be in order of rank id when type is 'list'. Default: None.
+
+    Returns:
+        None or addition_info which user saved in last train.
+
+    Examples:
+        .. code-block:: python
+
+            exe.run(startup_program)
+            ckpt_path = ['./output/step_10/model_state_rank0.pdmodel', 
+                         './output/step_10/model_state_rank1.pdmodel']
+            load_distributed_checkpoint(ckpt_path, main_program)
+    """
+    checkpoint_path = _check_valid_path(checkpoint_path)
+    dist_attr_path = _check_valid_path(dist_attr_path)
+
+    if checkpoint_path and dist_attr_path:
+        raise NotImplementedError(
+            "Merge&Slice parameter with dist_attr has not been implemented.")
+
+    elif checkpoint_path:
+        assert len(checkpoint_path) == paddle.distributed.get_world_size(), \
+            "The number of checkpoint_path must equal to the number of ranks"
+        rank = paddle.distributed.get_rank()
+        state_dict_info = paddle.load(checkpoint_path[rank])
+        state_dict = state_dict_info["model"]
+    else:
+        raise ValueError("'checkpoint_path' can not be None.")
+
+    program.set_state_dict(state_dict) if program else \
+        warnings.warn("'Program' is None, parameters will not be loaded.")
+
+    if "addition_info" not in state_dict_info:
+        return
+
+    return state_dict_info["addition_info"]
diff --git a/python/paddle/distributed/fleet/data_generator/__init__.py b/python/paddle/distributed/fleet/data_generator/__init__.py
index 230ada2abec06..2288aca43f751 100644
--- a/python/paddle/distributed/fleet/data_generator/__init__.py
+++ b/python/paddle/distributed/fleet/data_generator/__init__.py
@@ -11,6 +11,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 
-from .data_generator import DataGenerator  # noqa: F401
+from .data_generator import DataGenerator, MultiSlotDataGenerator  # noqa: F401
 
 __all__ = []
diff --git a/python/paddle/distributed/fleet/elastic/manager.py b/python/paddle/distributed/fleet/elastic/manager.py
index 4e8853780f4dc..2a344e92765d9 100644
--- a/python/paddle/distributed/fleet/elastic/manager.py
+++ b/python/paddle/distributed/fleet/elastic/manager.py
@@ -20,8 +20,13 @@
 import signal
 import random
 
-logging.basicConfig(level=os.environ.get('LOGLEVEL', 'INFO').upper())
 logger = logging.getLogger("ELASTIC")
+logger.setLevel(logging.INFO)
+formatter = logging.Formatter(
+    fmt='%(name)s %(levelname)s %(asctime)s %(message)s')
+ch = logging.StreamHandler()
+ch.setFormatter(formatter)
+logger.addHandler(ch)
 
 ELASTIC_EXIT_CODE = 101
 
diff --git a/python/paddle/distributed/fleet/launch.py b/python/paddle/distributed/fleet/launch.py
index 16b39e0fc8e45..946c89866994c 100644
--- a/python/paddle/distributed/fleet/launch.py
+++ b/python/paddle/distributed/fleet/launch.py
@@ -65,6 +65,7 @@
 import time
 import six
 import copy
+import argparse
 from argparse import ArgumentParser, REMAINDER
 import paddle
 import paddle.fluid as fluid
@@ -162,6 +163,31 @@ def _parse_args():
         type=str,
         default="127.0.0.1",
         help="Paddle cluster nodes ips, such as 192.168.0.16,192.168.0.17..")
+    collective_group.add_argument(
+        "--rank_mapping_file",
+        type=argparse.FileType('r'),
+        default=sys.stdin,
+        help="This rank mapping information in json format is used specifically "
+        "for lazy launch for auto parallel. Some of the ranks in each node "
+        "may not be used, and the indices of rank should be kept the same "
+        "as the indices of sub-task splited by auto parallel. "
+        " { "
+        "   \"ip_ranks\": [ "
+        "     { "
+        "       \"ip\": \"127.0.0.1\", "
+        "       \"ranks\": [0,1] "
+        "     }, "
+        "     { "
+        "       \"ip\": \"127.0.0.2\", "
+        "       \"ranks\": [2,3,4] "
+        "     } "
+        "   ] "
+        " } ")
+    collective_group.add_argument(
+        "--enable_auto_mapping",
+        type=bool,
+        default=False,
+        help="Set true to enable the lazy launch for auto-parallel scenario.")
 
     ps_group = parser.add_argument_group("Parameter-Server Parameters")
     # for parameter server
@@ -261,21 +287,25 @@ def launch_collective(args):
     start_port = 6170
     if os.environ.get('FLAGS_START_PORT') is not None:
         start_port = os.environ.get('FLAGS_START_PORT')
-    if cloud_utils.use_paddlecloud() and trainers_num != 1:
-        cluster, pod = cloud_utils.get_cloud_cluster(
-            args.ips, device_mode, devices_per_proc, start_port)
-        logger.debug("get cluster from cloud:{}".format(cluster))
-    elif device_mode == DeviceMode.ASCEND_NPU:
-        # for ascend
-        cluster, pod = ascend_utils.get_cloud_cluster(
-            rank_table_file=os.getenv("RANK_TABLE_FILE", None),
-            device_mode=device_mode,
-            start_port=start_port)
+    # lazy launch for auto-parallel
+    if args.enable_auto_mapping == True:
+        cluster, pod = get_mapped_cluster_from_args(args, device_mode)
     else:
-        # trainers_num = 1 or not use paddlecloud ips="a,b"
-        cluster, pod = get_cluster_from_args(args, device_mode,
-                                             devices_per_proc)
-        logger.debug("get cluster from args:{}".format(cluster))
+        # for ascend
+        if device_mode == DeviceMode.ASCEND_NPU:
+            cluster, pod = ascend_utils.get_cloud_cluster(
+                rank_table_file=os.getenv("RANK_TABLE_FILE", None),
+                device_mode=device_mode,
+                start_port=start_port)
+        elif cloud_utils.use_paddlecloud() and trainers_num != 1:
+            cluster, pod = cloud_utils.get_cloud_cluster(
+                args.ips, device_mode, devices_per_proc, start_port)
+            logger.debug("get cluster from cloud:{}".format(cluster))
+        else:
+            # trainers_num = 1 or not use paddlecloud ips="a,b"
+            cluster, pod = get_cluster_from_args(args, device_mode,
+                                                 devices_per_proc)
+            logger.debug("get cluster from args:{}".format(cluster))
 
     global_envs = copy.copy(os.environ.copy())
     gloo_rendezvous_dir = tempfile.mkdtemp()
@@ -334,7 +364,20 @@ def launch_ps(args, distribute_mode):
     return
 
 
+def infer_backend(args):
+    if args.backend != "auto": return
+    if fluid.core.is_compiled_with_cuda():
+        args.backend = 'nccl'
+    elif fluid.core.is_compiled_with_npu():
+        args.backend = 'unknown'
+    elif fluid.core.is_compiled_with_xpu():
+        args.backend = 'bkcl'
+    else:
+        args.backend = 'gloo'
+
+
 def which_distributed_mode(args):
+    infer_backend(args)  # modify the args.backend
     if args.run_mode is not None:
         assert args.run_mode in ["collective", "ps", "ps-heter"]
 
@@ -368,12 +411,9 @@ def which_distributed_mode(args):
 
     if fluid.core.is_compiled_with_cuda():
         accelerators = fluid.core.get_cuda_device_count()
-        args.backend = 'nccl'
     elif fluid.core.is_compiled_with_npu():
-        args.backend = 'unknown'
         accelerators = fluid.core.get_npu_device_count()
     elif fluid.core.is_compiled_with_xpu():
-        args.backend = 'bkcl'
         accelerators = fluid.core.get_xpu_device_count()
     else:
         accelerators = 0
@@ -400,7 +440,6 @@ def which_distributed_mode(args):
 But found args.servers not empty, default use ps mode")
                 return DistributeMode.PS
             else:
-                args.backend = "gloo"
                 return DistributeMode.COLLECTIVE
         else:
             logger.warning(
@@ -583,20 +622,21 @@ def launch():
     _print_arguments(args)
 
     if args.backend == 'auto':
-        distribute_mode = which_distributed_mode(args)
-        assert args.backend in [
-            'gloo', 'nccl', 'bkcl', 'unknown'
-        ]  # which_distributed_mode must modify args.backend
+        distribute_mode = which_distributed_mode(
+            args)  # which_distributed_mode must modify args.backend
     else:
         assert args.run_mode == 'collective' or args.run_mode == None, "When backend is not 'auto', run mode must be collective"
         check_backend(args.backend)
         distribute_mode = DistributeMode.COLLECTIVE
 
-    block_windows_and_macos(
-        args.backend)  # raise error when using gloo on windows or macos
+    assert args.backend in ['gloo', 'nccl', 'bkcl', 'unknown']
+
     if args.backend == 'gloo':
         logger.warning("launch start with CPUONLY mode")
 
+    block_windows_and_macos(
+        args.backend)  # raise error when using gloo on windows or macos
+
     if enable_elastic(args, distribute_mode):
         launch_elastic(args, distribute_mode)
         return
diff --git a/python/paddle/distributed/fleet/launch_utils.py b/python/paddle/distributed/fleet/launch_utils.py
index 3aced0ab996cb..b4ebe9ef125b0 100644
--- a/python/paddle/distributed/fleet/launch_utils.py
+++ b/python/paddle/distributed/fleet/launch_utils.py
@@ -27,6 +27,7 @@
 import warnings
 import six
 import struct
+import json
 
 import paddle
 import paddle.fluid as fluid
@@ -527,8 +528,9 @@ def start_local_trainers(cluster,
                             pretty_print_envs(proc_env, ("Distributed Envs",
                                                          "Value"))))
             logger.info(
-                "details abouts PADDLE_TRAINER_ENDPOINTS can be found in {}/endpoints.log, and detail running logs maybe found in {}/workerlog.0".
-                format(log_dir, log_dir))
+                "details about PADDLE_TRAINER_ENDPOINTS can be found in "
+                "{}/endpoints.log, and detail running logs maybe found in "
+                "{}/workerlog.0".format(log_dir, log_dir))
         fn = None
         pre_fn = None if os.name == 'nt' else os.setsid
         if log_dir is not None:
@@ -805,6 +807,97 @@ def cloud_ps_heter_env_set(args):
         pretty_print_envs(environs)))
 
 
+def get_mapped_cluster(node_ips, node_ip, trainer_endpoints, device_mode,
+                       node_mapping_ranks):
+    assert type(trainer_endpoints) is list, "trainer_endpoints must be list"
+    assert device_mode == DeviceMode.GPU, \
+        "Only support get mapped cluster for gpu now."
+    cluster = Cluster(hdfs=None)
+    for node_rank, ip in enumerate(node_ips):
+        pod = Pod()
+        pod.rank = node_rank
+        pod.addr = ip
+        pod.device_mode = device_mode
+        cur_node_endpoints = trainer_endpoints[node_rank]
+
+        # choose rank from global mapped ranks and set it to the trainer.
+        ranks_per_node = node_mapping_ranks[node_rank]
+        for i in range(len(ranks_per_node)):
+            trainer = Trainer()
+            # change global rank(mapped) to local rank within each node.
+            # e.g. mapped ranks of node: 3,4,7 -> 0,1,2
+            local_rank = ranks_per_node.index(ranks_per_node[i])
+            trainer.accelerators.append(local_rank)
+            trainer.endpoint = "%s" % (cur_node_endpoints[i])
+            # global mapped ranks
+            trainer.rank = ranks_per_node[i]
+
+            pod.trainers.append(trainer)
+        cluster.pods.append(pod)
+
+    pod_rank = node_ips.index(node_ip)
+    return cluster, cluster.pods[pod_rank]
+
+
+def get_mapped_cluster_from_args(args, device_mode):
+    assert device_mode == DeviceMode.GPU, \
+        "Only support get mapped cluster for gpu now."
+    gpus_num = fluid.core.get_cuda_device_count()
+
+    # parse ip-ranks json file
+    json_data = None
+    with args.rank_mapping_file as json_file:
+        json_data = json.load(json_file)
+
+    node_ips = []
+    node_ranks_mapping = []
+    ip_ranks_list = json_data['ip_ranks']
+    for ip_ranks in ip_ranks_list:
+        node_ips.append(ip_ranks['ip'])
+        node_ranks_mapping.append(ip_ranks['ranks'])
+
+    if len(node_ips) == 1:
+        node_ip = node_ips[0]
+    else:
+        if args.host:
+            node_ip = args.host
+        else:
+            _, node_ip = get_host_name_ip()
+
+    assert node_ip in node_ips, \
+        "Can't find your local ip {%s} in node_ips: {%s}" % (node_ip, node_ips)
+    node_rank = node_ips.index(node_ip)
+
+    assert len(node_ranks_mapping[node_rank]) <= gpus_num, \
+        "number of ranks mapped to one node should not exceed the avaiable ones."
+    assert len(node_ranks_mapping) == len(node_ips), \
+        "ranks length should be equal to ips length."
+
+    logger.debug("parsed from args: node_ips:{} node_ip:{} "
+                 "node_rank:{} node_ranks_mapping:{}".format(
+                     node_ips, node_ip, node_rank, node_ranks_mapping[
+                         node_rank]))
+
+    # NOTE: there are different number of global mapped ranks on each node.
+    free_ports = []
+    trainer_endpoints = []
+    for ip in node_ips:
+        node_rank = node_ips.index(ip)
+        if os.environ.get('FLAGS_START_PORT') is not None:
+            start_port = int(os.environ.get('FLAGS_START_PORT'))
+            free_ports = [
+                x
+                for x in range(start_port, start_port + len(node_ranks_mapping[
+                    node_rank]))
+            ]
+        else:
+            free_ports = find_free_ports(len(node_ranks_mapping[node_rank]))
+        trainer_endpoints.append(["%s:%d" % (ip, port) for port in free_ports])
+
+    return get_mapped_cluster(node_ips, node_ip, trainer_endpoints, device_mode,
+                              node_ranks_mapping)
+
+
 class ParameterServerLauncher(object):
     def __init__(self, args, distribute_mode):
         self.args = args
diff --git a/python/paddle/distributed/fleet/runtime/the_one_ps.py b/python/paddle/distributed/fleet/runtime/the_one_ps.py
index 642d0e427fa8c..0b874b8c61ac4 100644
--- a/python/paddle/distributed/fleet/runtime/the_one_ps.py
+++ b/python/paddle/distributed/fleet/runtime/the_one_ps.py
@@ -868,11 +868,11 @@ def _init_server(self, dirname=None, var_names=None, **kwargs):
 
         for var_name in load_varnames:
             table_id = sparse_table_maps[var_name]
-            path = os.path.join(dirname, var_name + PSERVER_SAVE_SUFFIX,
-                                "{}.block{}.txt".format(var_name, pserver_id))
-            meta = os.path.join(dirname, var_name + PSERVER_SAVE_SUFFIX,
-                                "{}.block{}.meta".format(var_name, pserver_id))
-            self._server.load_sparse(path, meta, table_id)
+            # path = os.path.join(dirname, var_name + PSERVER_SAVE_SUFFIX,
+            #                     "{}.block{}.txt".format(var_name, pserver_id))
+            # meta = os.path.join(dirname, var_name + PSERVER_SAVE_SUFFIX,
+            #                     "{}.block{}.meta".format(var_name, pserver_id))
+            self._server.load_sparse(dirname, "0", table_id)
 
     def _run_server(self):
         if self.role_maker._is_heter_worker():
@@ -967,8 +967,12 @@ def _save_distributed_persistables(self,
                 TheOnePSRuntime.__exclude_vars(saved_varnames),
                 main_program.list_vars()))
 
+        self._communicator.pull_dense(denses)
+
         import paddle
         for var in remaining_vars:
+            if var.name not in recv_dense_varnames:
+                continue
             tensor = var.get_value()
             paddle.save(
                 tensor, os.path.join(dirname, var.name), use_binary_format=True)
@@ -1063,8 +1067,64 @@ def _save_inference_model(self, *args, **kwargs):
     def _save_persistables(self, *args, **kwargs):
         self._ps_inference_save_persistables(*args, **kwargs)
 
+    def _load_sparse_params(self, dirname, context, main_program, mode):
+        from paddle.fluid.incubate.fleet.parameter_server.ir.public import get_sparse_tablenames
+        distributed_varnames = get_sparse_tablenames(
+            self.compiled_strategy.origin_main_program, True)
+        values = []
+        for id, names in context.items():
+            if names[0] not in distributed_varnames:
+                # TODO: only load sparse param from local
+                warnings.warn("varname is not in distributed_varnames, pass")
+            # load sparse & distributed param on server
+            self._worker.load_one_table(id, dirname, mode)
+            values.extend(names)
+        return values
+
+    def _load_distributed_persistables(self, dirname, main_program=None,
+                                       mode=0):
+        if main_program is None:
+            main_program = self.compiled_strategy.get_origin_ps_main_program()
+
+        if isinstance(main_program, CompiledProgram):
+            raise TypeError(
+                "in fleet.save() function, main_program must be as Program type, CompiledProgram is not allowed"
+            )
+
+        denses = self.compiled_strategy.get_the_one_recv_context(
+            is_dense=True,
+            split_dense_table=self.role_maker._is_heter_parameter_server_mode,
+            use_origin_program=True)
+        sparses = self.compiled_strategy.get_the_one_recv_context(
+            is_dense=False,
+            split_dense_table=self.role_maker._is_heter_parameter_server_mode,
+            use_origin_program=True)
+
+        sparse_varnames = self._load_sparse_params(dirname, sparses,
+                                                   main_program, mode)
+
+        recv_dense_varnames = []
+        for id, names in denses.items():
+            recv_dense_varnames.extend(names)
+
+        loaded_varnames = sparse_varnames
+
+        remaining_vars = list(
+            filter(
+                TheOnePSRuntime.__exclude_vars(loaded_varnames),
+                main_program.list_vars()))
+
+        import paddle
+        for var in remaining_vars:
+            if var.name not in recv_dense_varnames:
+                continue
+            tensor = paddle.load(os.path.join(dirname, var.name))
+            var.set_value(tensor)
+
+        self._communicator.init_params(denses)
+
     def load_model(self, path, mode):
-        self._worker.load_model(path, mode)
+        self._load_distributed_persistables(path, mode=mode)
 
     def _shrink(self, threshold):
         import paddle.distributed.fleet as fleet
diff --git a/python/paddle/fft.py b/python/paddle/fft.py
index de15eba0feffa..a62e502203b63 100644
--- a/python/paddle/fft.py
+++ b/python/paddle/fft.py
@@ -1300,13 +1300,13 @@ def fftshift(x, axes=None, name=None):
     shape = paddle.shape(x)
     if axes is None:
         # shift all axes
-        rank = paddle.rank(x).reshape([1])
-        axes = axes or paddle.arange(0, rank)
-        shifts = [size // 2 for size in shape]
+        rank = len(x.shape)
+        axes = list(range(0, rank))
+        shifts = shape // 2
     elif isinstance(axes, int):
         shifts = shape[axes] // 2
     else:
-        shifts = [shape[ax] // 2 for ax in axes]
+        shifts = paddle.concat([shape[ax] // 2 for ax in axes])
     return paddle.roll(x, shifts, axes, name=name)
 
 
@@ -1343,13 +1343,13 @@ def ifftshift(x, axes=None, name=None):
     shape = paddle.shape(x)
     if axes is None:
         # shift all axes
-        rank = paddle.rank(x).reshape([1])
-        axes = axes or paddle.arange(0, rank)
-        shifts = [-size // 2 for size in shape]
+        rank = len(x.shape)
+        axes = list(range(0, rank))
+        shifts = -shape // 2
     elif isinstance(axes, int):
         shifts = -shape[axes] // 2
     else:
-        shifts = [-shape[ax] // 2 for ax in axes]
+        shifts = paddle.concat([-shape[ax] // 2 for ax in axes])
     return paddle.roll(x, shifts, axes, name=name)
 
 
diff --git a/python/paddle/fluid/communicator.py b/python/paddle/fluid/communicator.py
index fa497f5c2840d..9a75ef8c58edf 100644
--- a/python/paddle/fluid/communicator.py
+++ b/python/paddle/fluid/communicator.py
@@ -161,6 +161,9 @@ def recv(self):
     def init_params(self, context):
         self.communicator_.init_params(context)
 
+    def pull_dense(self, context):
+        self.communicator_.pull_dense(context)
+
     def push_sparse_param(self, var_name, table_id=-1, scope=global_scope()):
         if not self.is_running():
             raise ValueError(
diff --git a/python/paddle/fluid/contrib/mixed_precision/fp16_lists.py b/python/paddle/fluid/contrib/mixed_precision/fp16_lists.py
index 5b662b09f1cf6..95e597c703b4e 100644
--- a/python/paddle/fluid/contrib/mixed_precision/fp16_lists.py
+++ b/python/paddle/fluid/contrib/mixed_precision/fp16_lists.py
@@ -104,7 +104,7 @@ def _update_list(self):
     'reduce_sum',
 }
 
-# This set contains two types of ops. All ops supported fp16 calculation. One 
+# This set contains two types of ops. All ops supported fp16 calculation. One
 # of two types is considered numerically-safe, but may be made unsafe by an
 # upstream blacklist op. Another type do not have numerically-significant
 # effects, like stack, flatten2.
@@ -153,6 +153,8 @@ def _update_list(self):
     'c_allreduce_sum',
     'concat',
     'split',
+    'fused_feedforward',
+    'fused_attention',
 }
 
 # The set of ops that don't support fp16 calculation
diff --git a/python/paddle/fluid/contrib/mixed_precision/fp16_utils.py b/python/paddle/fluid/contrib/mixed_precision/fp16_utils.py
index 6317be9a2e2e2..36546c1de1204 100644
--- a/python/paddle/fluid/contrib/mixed_precision/fp16_utils.py
+++ b/python/paddle/fluid/contrib/mixed_precision/fp16_utils.py
@@ -40,7 +40,7 @@
 
 def _rename_arg(op, old_name, new_name):
     """
-    If an op has old_name input and output, rename these input 
+    If an op has old_name input and output, rename these input
     args new_name.
 
     Args:
@@ -89,6 +89,10 @@ def _keep_fp32_input(op, in_name):
         return in_name not in {'X', 'Z'}
     if op_type == 'resnet_unit':
         return in_name not in {'X', 'FilterX', 'Z', 'FilterZ'}
+    if op_type in ['fused_attention', 'fused_feedforward']:
+        return in_name in {
+            'LnScale', 'LnBias', 'Ln2Scale', 'Ln2Bias', "Ln1Scale", "Ln1Bias"
+        }
     return False
 
 
@@ -98,6 +102,11 @@ def _keep_fp32_output(op, out_name):
         return out_name != 'Y'
     if op_type == 'resnet_unit':
         return out_name not in {'Y', 'ConvX', 'ConvZ'}
+    if op_type in ['fused_attention', 'fused_feedforward']:
+        return out_name in {
+            'LnMean', 'LnVariance', 'Ln2Mean', 'Ln2Variance', 'Ln1Mean',
+            'Ln1Variance'
+        }
     return False
 
 
@@ -256,16 +265,16 @@ def find_true_post_op(ops, cur_op, var_name, search_all=False):
         ops (list): A list of ops.
         cur_op (Operator): Current operator which has var_name variable.
         var_name (string): Variable name.
-        search_all (bool): The type of operator search. Use if \"cur_op\" is not in the \"ops\" set. 
+        search_all (bool): The type of operator search. Use if \"cur_op\" is not in the \"ops\" set.
     """
     post_op = []
     if search_all:
         """
-        \"cur_op\" do not have to be in list of \"ops\". E.g. \"cur_op\" can come 
-        from startup_prog block and \"ops\" list from main_prog block. 
-        By setting idx to -1, we'll start looking for post-ops from the top of the list. 
-        If search_all is False, assume that \"cur_op\" is in \"ops\" list, 
-        so to reduce the time of search we can start iterating from \"cur_op\" idx. 
+        \"cur_op\" do not have to be in list of \"ops\". E.g. \"cur_op\" can come
+        from startup_prog block and \"ops\" list from main_prog block.
+        By setting idx to -1, we'll start looking for post-ops from the top of the list.
+        If search_all is False, assume that \"cur_op\" is in \"ops\" list,
+        so to reduce the time of search we can start iterating from \"cur_op\" idx.
         """
         idx = -1
     else:
@@ -517,19 +526,19 @@ def cast_parameters_to_fp16(place, program, scope=None, to_fp16_var_names=None):
 
 def rewrite_program(main_prog, amp_lists):
     """
-    Traverse all ops in current block and insert cast op according to 
+    Traverse all ops in current block and insert cast op according to
     which set current op belongs to.
 
     1. When an op belongs to the black list, add it to black set
     2. When an op belongs to the white list, add it to white set
-    3. When an op belongs to the gray list. If one 
-       of its inputs is the output of black set op or black list op, 
-       add it to black set. If all of its previous ops are not black 
-       op and one of its inputs is the output of white set op or 
+    3. When an op belongs to the gray list. If one
+       of its inputs is the output of black set op or black list op,
+       add it to black set. If all of its previous ops are not black
+       op and one of its inputs is the output of white set op or
        white list op, add it to white set.
     4. When an op isn't in the lists, add it to black op set.
-    5. Add necessary cast ops to make sure that black set op will be 
-       computed in fp32 mode, while white set op will be computed in 
+    5. Add necessary cast ops to make sure that black set op will be
+       computed in fp32 mode, while white set op will be computed in
        fp16 mode.
 
     Args:
diff --git a/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py b/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py
index dc355fec0d362..9b2954b13f222 100644
--- a/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py
+++ b/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py
@@ -272,7 +272,8 @@ class QuantizationTransformPass(object):
     the quantized ops's inputs.
     """
     _supported_quantizable_op_type = [
-        'conv2d', 'depthwise_conv2d', 'conv2d_transpose', 'mul', 'matmul'
+        'conv2d', 'depthwise_conv2d', 'conv2d_transpose', 'mul', 'matmul',
+        'matmul_v2'
     ]
 
     def __init__(self,
@@ -520,6 +521,16 @@ def _transform_backward(graph, op):
                     dequant_var_node = dequantized_vars[var_node.name()]
                     graph.update_input_link(var_node, dequant_var_node, op)
 
+        def _has_weight(op):
+            has_weight = False
+            for var_node in op.inputs:
+                if var_node.name() not in op.input_arg_names():
+                    continue
+                name = var_node.name()
+                if var_node.name() in persistable_vars:
+                    has_weight = True
+            return has_weight
+
         if not self._is_test:
             self._create_global_step(graph)
         ops = graph.all_op_nodes()
@@ -535,11 +546,11 @@ def _transform_backward(graph, op):
         # The loop for transforming the forward graph:
         for op in ops:
             if op.name() in self._quantizable_ops:
-                if not self._is_skip_quant(graph, op):
+                if not self._is_skip_quant(graph, op) and _has_weight(op):
                     _transform_forward(graph, op)
         # The loop for renaming the inputs of backward op.
         for op in ops:
-            if op.name() in self._quantizable_grad_ops:
+            if op.name() in self._quantizable_grad_ops and _has_weight(op):
                 _transform_backward(graph, op)
         graph.resolve_hazard()
         return graph
@@ -1281,10 +1292,11 @@ def _insert_post_channel_dequant_op(self, graph, op_node, quant_axis):
             var_type=output_var_node.type(),
             shape=output_var_node.shape(),
             var_dtype=output_var_node.dtype())
+        x_num_col_dims = 1
+        if op_node.name() in ['matmul', 'matmul_v2', 'mul']:
+            x_num_col_dims = len(op_node.outputs[0].shape()) - 1
         if op_node.op().has_attr("x_num_col_dims"):
             x_num_col_dims = op_node.op().attr("x_num_col_dims")
-        else:
-            x_num_col_dims = 1
         dequant_op_node = graph.create_op_node(
             op_type='fake_channel_wise_dequantize_max_abs',
             attrs={
diff --git a/python/paddle/fluid/contrib/sparsity/__init__.py b/python/paddle/fluid/contrib/sparsity/__init__.py
index b36a79b8ca865..9bf45f4272738 100644
--- a/python/paddle/fluid/contrib/sparsity/__init__.py
+++ b/python/paddle/fluid/contrib/sparsity/__init__.py
@@ -25,8 +25,10 @@
 from .utils import check_sparsity
 from .utils import MaskAlgo
 from .utils import CheckMethod
-from .asp import decorate, prune_model
-from .asp import set_excluded_layers, reset_excluded_layers
+from .asp import decorate
+from .asp import prune_model
+from .asp import set_excluded_layers
+from .asp import reset_excluded_layers
 
 __all__ = [
     'calculate_density', 'check_mask_1d', 'get_mask_1d', 'check_mask_2d',
diff --git a/python/paddle/fluid/contrib/sparsity/asp.py b/python/paddle/fluid/contrib/sparsity/asp.py
index 77c61faf23dee..61e3a61fc9cd2 100644
--- a/python/paddle/fluid/contrib/sparsity/asp.py
+++ b/python/paddle/fluid/contrib/sparsity/asp.py
@@ -19,10 +19,9 @@
 import copy
 import numpy as np
 import paddle
-from paddle.fluid import framework, global_scope, program_guard, layers
+from paddle.fluid import global_scope, program_guard, layers
 from paddle.fluid.initializer import ConstantInitializer
 from paddle.fluid.contrib import sparsity
-from paddle.fluid import core
 
 __all__ = [
     'decorate', 'prune_model', 'set_excluded_layers', 'reset_excluded_layers'
@@ -36,6 +35,35 @@ def set_excluded_layers(main_program, param_names):
     Args:
         main_program (Program, optional): Program with model definition and its parameters.
         param_names (list): A list contains names of parameters.
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            from paddle.static import sparsity
+
+            paddle.enable_static()
+
+            main_program = paddle.static.Program()
+            startup_program = paddle.static.Program()
+
+            with paddle.static.program_guard(main_program, startup_program):
+                input_data = paddle.static.data(name='data', shape=[None, 128])
+                label = paddle.static.data(name='label', shape=[None, 10])
+                hidden = paddle.static.nn.fc(x=input_data, num_flatten_dims=-1, size=32, activation=None, name="need_sparse_fc")
+                hidden = paddle.static.nn.fc(x=hidden, num_flatten_dims=-1, size=32, activation=None, name="need_dense_fc")
+                prob = paddle.static.nn.fc(x=hidden, num_flatten_dims=-1, size=10, activation=None)
+                loss = paddle.mean(paddle.nn.functional.square_error_cost(prob, label))
+
+                # Setup exluded layers out from ASP workflow.
+                # Please note, excluded_layers must be set before calling `optimizer.minimize()`.
+                sparsity.set_excluded_layers(main_program, ["need_dense_fc"])
+
+                optimizer = paddle.optimizer.SGD(learning_rate=0.1)
+                optimizer = paddle.static.amp.decorate(optimizer )
+                # Calling sparsity.decorate() to wrap minimize() in optimizer, which 
+                # will insert necessary masking operations for ASP workflow.
+                optimizer = sparsity.decorate(optimizer)
+                optimizer.minimize(loss, startup_program)
     """
     ASPHelper.set_excluded_layers(
         main_program=main_program, param_names=param_names)
@@ -48,6 +76,33 @@ def reset_excluded_layers(main_program=None):
 
     Args:
         main_program (Program, optional): Program with model definition and its parameters.
+        Examples:
+        .. code-block:: python
+
+            import paddle
+            from paddle.static import sparsity
+
+            paddle.enable_static()
+
+            main_program = paddle.static.Program()
+            startup_program = paddle.static.Program()
+
+            with paddle.static.program_guard(main_program, startup_program):
+                input_data = paddle.static.data(name='data', shape=[None, 128])
+                label = paddle.static.data(name='label', shape=[None, 10])
+                hidden = paddle.static.nn.fc(x=input_data, num_flatten_dims=-1, size=32, activation=None, name="my_first_fc")
+                hidden = paddle.static.nn.fc(x=hidden, num_flatten_dims=-1, size=32, activation=None, name="my_second_fc")
+                prob = paddle.static.nn.fc(x=hidden, num_flatten_dims=-1, size=10, activation=None)
+                loss = paddle.mean(paddle.nn.functional.square_error_cost(prob, label))
+
+                # Setup exluded layers out from ASP workflow.
+                # Please note, excluded_layers must be set before calling `optimizer.minimize()`.
+                sparsity.set_excluded_layers(main_program, ["my_second_fc"])
+                # Now the weights of "my_second_fc" would not be included in Automatic SParsity's workflow.
+
+            # Reset excluded_layers, all FC layers would be included into Automatic SParsity's workflow.
+            # Please note, reset_excluded_layers also must be called before calling `optimizer.minimize()`.
+            sparsity.reset_excluded_layers(main_program)
     """
     ASPHelper.reset_excluded_layers(main_program=main_program)
 
@@ -65,22 +120,21 @@ def decorate(optimizer):
         .. code-block:: python
 
             import paddle
-            import paddle.fluid as fluid
-            from paddle.fluid.contrib import sparsity
+            from paddle.static import sparsity
 
-            main_program = fluid.Program()
-            startup_program = fluid.Program()
+            main_program = paddle.static.Program()
+            startup_program = paddle.static.Program()
 
             paddle.enable_static()
 
-            with fluid.program_guard(main_program, startup_program):
-                input_data = fluid.layers.data(name='data', shape=[None, 128])
-                label = fluid.layers.data(name='label', shape=[None, 10])
-                hidden = fluid.layers.fc(input=input_data, num_flatten_dims=-1, size=32, act=None)
-                prob = fluid.layers.fc(input=hidden, num_flatten_dims=-1, size=10, act=None)
-                loss = fluid.layers.mean(fluid.layers.square_error_cost(prob, label))
+            with paddle.static.program_guard(main_program, startup_program):
+                input_data = paddle.static.data(name='data', shape=[None, 128])
+                label = paddle.static.data(name='label', shape=[None, 10])
+                hidden = paddle.static.nn.fc(x=input_data, num_flatten_dims=-1, size=32, activation=None)
+                prob = paddle.static.nn.fc(x=hidden, num_flatten_dims=-1, size=10, activation=None)
+                loss = paddle.mean(paddle.nn.functional.square_error_cost(prob, label))
 
-                optimizer = fluid.optimizer.SGD(learning_rate=0.1)
+                optimizer = paddle.optimizer.SGD(learning_rate=0.1)
                 optimizer = sparsity.decorate(optimizer)
                 # if do sparse training with Fleet, please replace above decorate with:
                 # strategy = paddle.distributed.fleet.DistributedStrategy()
@@ -92,15 +146,14 @@ def decorate(optimizer):
     return ASPHelper.decorate(optimizer)
 
 
-def prune_model(place,
-                main_program=None,
+def prune_model(main_program=None,
                 n=2,
                 m=4,
-                func_name=sparsity.MaskAlgo.MASK_1D,
+                mask_algo='mask_1d',
                 with_mask=True):
     r"""
     Pruning parameters of supported layers in :attr:`main_program` via 
-    specified mask generation function given by :attr:`func_name`. This 
+    specified mask generation function given by :attr:`mask_algo`. This 
     function supports both training and inference controlled by :attr:`with_mask`.
     If :attr:`with_mask` is True, it would also prune parameter related ASP mask Variables,
     else only prunes parameters.
@@ -114,11 +167,11 @@ def prune_model(place,
     inference only. To obtain OptimizerWithSparsityGuarantee, please see `sparsity.decoreate()`.
 
     Args:
-        place (fluid.CPUPlace()|fluid.CUDAPlace(N)): Device place for pruned parameter and mask Variables, and N means the GPU's id. It should be the same as created instance of Executor.
         main_program (Program, optional): Program with model definition and its parameters. Default is `paddle.static.default_main_program()
         n (int): n of `n:m` sparse pattern.
         m (int): m of `n:m` sparse pattern.
-        func_name (MaskAlgo, optional): The function name to generate spase mask. Default is `MaskAlgo.MASK_1D`. All options please refer to `MaskAlgo`.
+        mask_algo (string, optional): The function name to generate spase mask. Default is `mask_1d`.
+                                      The vaild inputs should be one of 'mask_1d', 'mask_2d_greedy' and 'mask_2d_best'.
         with_mask (bool, optional): To prune mask Variables related to parameters or not. Ture is purning also, False is not. Defalut is True.
     Returns:
         dictionary: A dictionary with key: `parameter name` (string) and value: its corresponding mask Variable.
@@ -126,50 +179,58 @@ def prune_model(place,
         .. code-block:: python
 
             import paddle
-            import paddle.fluid as fluid
-            import paddle.fluid.core as core
-            from paddle.fluid.contrib import sparsity
+            from paddle.static import sparsity
 
             paddle.enable_static()
 
-            main_program = fluid.Program()
-            startup_program = fluid.Program()
+            main_program = paddle.static.Program()
+            startup_program = paddle.static.Program()
 
-            place = paddle.CPUPlace()
-            if core.is_compiled_with_cuda():
-                place = paddle.CUDAPlace(0)
-
-            with fluid.program_guard(main_program, startup_program):
-                input_data = fluid.layers.data(name='data', shape=[None, 128])
-                label = fluid.layers.data(name='label', shape=[None, 10])
-                hidden = fluid.layers.fc(input=input_data, num_flatten_dims=-1, size=32, act=None, name="need_sparse")
-                hidden = fluid.layers.fc(input=hidden, num_flatten_dims=-1, size=32, act=None, name="need_dense")
-                prob = fluid.layers.fc(input=hidden, num_flatten_dims=-1, size=10, act=None)
-                loss = fluid.layers.mean(fluid.layers.square_error_cost(prob, label))
+            with paddle.static.program_guard(main_program, startup_program):
+                input_data = paddle.static.data(name='data', shape=[None, 128])
+                label = paddle.static.data(name='label', shape=[None, 10])
+                hidden = paddle.static.nn.fc(x=input_data, num_flatten_dims=-1, size=32, activation=None, name="need_sparse_fc")
+                hidden = paddle.static.nn.fc(x=hidden, num_flatten_dims=-1, size=32, activation=None, name="need_dense_fc")
+                prob = paddle.static.nn.fc(x=hidden, num_flatten_dims=-1, size=10, activation=None)
+                loss = paddle.mean(paddle.nn.functional.square_error_cost(prob, label))
 
                 # Setup exluded layers out from ASP workflow.
                 # Please note, excluded_layers must be set before calling `optimizer.minimize()`.
-                sparsity.set_excluded_layers(main_program, ["need_dense"])
+                sparsity.set_excluded_layers(main_program, ["need_dense_fc"])
 
-                optimizer = fluid.optimizer.SGD(learning_rate=0.1)
-                optimizer = fluid.contrib.mixed_precision.decorator.decorate(optimizer )
+                optimizer = paddle.optimizer.SGD(learning_rate=0.1)
+                optimizer = paddle.static.amp.decorate(optimizer )
                 # Calling sparsity.decorate() to wrap minimize() in optimizer, which 
                 # will insert necessary masking operations for ASP workflow.
                 optimizer = sparsity.decorate(optimizer)
                 optimizer.minimize(loss, startup_program)
 
-            exe = fluid.Executor(place)
+            device = paddle.device.get_device()
+            place = paddle.set_device(device)
+
+            exe = paddle.static.Executor(place)
             exe.run(startup_program)
 
             # Must call `exe.run(startup_program)` first before calling `sparsity.prune_model`
-            sparsity.prune_model(place, main_program, func_name=sparsity.MaskAlgo.MASK_2D_BEST)
+            sparsity.prune_model(main_program, mask_algo='mask_2d_best')
     """
+    device = paddle.device.get_device()
+    place = paddle.set_device(device)
+
+    MaskAlgo_mapping = {
+        'mask_1d': sparsity.MaskAlgo.MASK_1D,
+        'mask_2d_greedy': sparsity.MaskAlgo.MASK_2D_GREEDY,
+        'mask_2d_best': sparsity.MaskAlgo.MASK_2D_BEST
+    }
+    assert (mask_algo in MaskAlgo_mapping), \
+           'The "mask_algo" should be one of ["mask_1d", "mask_2d_greedy", "mask_2d_best"]'
+
     return ASPHelper.prune_model(
         place=place,
         main_program=main_program,
         n=n,
         m=m,
-        func_name=func_name,
+        mask_algo=MaskAlgo_mapping[mask_algo],
         with_mask=with_mask)
 
 
@@ -256,12 +317,12 @@ def prune_model(cls,
                     main_program=None,
                     n=2,
                     m=4,
-                    func_name=sparsity.MaskAlgo.MASK_1D,
+                    mask_algo=sparsity.MaskAlgo.MASK_1D,
                     with_mask=True):
         r"""
         This is the implementation of `sparsity.prune_model`, for details please see explanation in `sparsity.prune_model`.
         """
-        checked_func_name = sparsity.CheckMethod.get_checking_method(func_name)
+        checked_func_name = sparsity.CheckMethod.get_checking_method(mask_algo)
 
         if main_program is None:
             main_program = paddle.static.default_main_program()
@@ -284,7 +345,7 @@ def prune_model(cls,
                 # matrices beforce invoking create_mask. Then we transpose the result maks to make 
                 # sure its shape to be the same as the input weight.
                 weight_sparse_mask = sparsity.create_mask(
-                    weight_nparray.T, func_name=func_name, n=n, m=m).T
+                    weight_nparray.T, func_name=mask_algo, n=n, m=m).T
                 weight_pruned_nparray = np.multiply(weight_nparray,
                                                     weight_sparse_mask)
                 weight_tensor.set(weight_pruned_nparray, place)
@@ -347,15 +408,14 @@ def _is_supported_layer(cls, main_program, param_name):
         Examples:
             .. code-block:: python
 
-              import paddle.fluid as fluid
-              from paddle.fluid.contrib.sparsity.asp import ASPHelper
+              from paddle.static.sparsity.asp import ASPHelper
 
-              main_program = fluid.Program()
-              startup_program = fluid.Program()
+              main_program = paddle.static.Program()
+              startup_program = paddle.static.Program()
 
-              with fluid.program_guard(main_program, startup_program):
-                  input_data = fluid.layers.data(name='data', shape=[None, 128])
-                  fc = fluid.layers.fc(input=input_data, num_flatten_dims=-1, size=32, act=None)
+              with paddle.static.program_guard(main_program, startup_program):
+                  input_data = paddle.static.data(name='data', shape=[None, 128])
+                  fc = paddle.static.nn.fc(x=input_data, num_flatten_dims=-1, size=32, activation=None)
 
               for param in main_program.global_block().all_parameters():
                   ASPHelper._is_supported_layer(main_program, param.name)
diff --git a/python/paddle/fluid/contrib/sparsity/utils.py b/python/paddle/fluid/contrib/sparsity/utils.py
index a72ea4d9b8510..8b8c043bc4bad 100644
--- a/python/paddle/fluid/contrib/sparsity/utils.py
+++ b/python/paddle/fluid/contrib/sparsity/utils.py
@@ -64,7 +64,8 @@ def get_checking_method(mask_algo):
             .. code-block:: python
 
             import numpy as np
-            from paddle.fluid.contrib.sparsity import MaskAlgo, CheckMethod
+            from paddle.static.sparsity import MaskAlgo
+            from paddle.fluid.contrib.sparsity import CheckMethod
 
             CheckMethod.get_checking_method(MaskAlgo.MASK_1D)
             # CheckMethod.CHECK_1D
@@ -95,7 +96,7 @@ def calculate_density(x):
         .. code-block:: python
 
           import numpy as np
-          import paddle.fluid.contrib.sparsity as sparsity
+          import paddle.static.sparsity as sparsity
 
           x = np.array([[0, 1, 3, 0],
                         [1, 1, 0, 1]])
@@ -446,7 +447,7 @@ def get_mask_2d_best(mat, n, m):
                           [5, 6, 3, 9],
                           [2, 4, 6, 9]])
           mask_greedy = sparsity.get_mask_2d_greedy(mat, 2, 4)
-          mask_greedy = sparsity.get_mask_2d_best(mat, 2, 4)
+          mask_best = sparsity.get_mask_2d_best(mat, 2, 4)
           print("L1 norm of `greedy` sparse matrix", np.multiply(mat, mask_greedy).sum()) # 56
           print("L1 norm of `best` sparse matrix", np.multiply(mat, mask_best).sum()) # 61
     """
diff --git a/python/paddle/fluid/dygraph/amp/auto_cast.py b/python/paddle/fluid/dygraph/amp/auto_cast.py
index ddde3e66c56dc..006287752839d 100644
--- a/python/paddle/fluid/dygraph/amp/auto_cast.py
+++ b/python/paddle/fluid/dygraph/amp/auto_cast.py
@@ -119,10 +119,7 @@ def _in_amp_guard():
 
 
 @dygraph_only
-def pure_fp16_initialize(enable_pure_fp16, models, optimizers):
-    if not enable_pure_fp16:
-        return models, optimizers
-
+def pure_fp16_initialize(models):
     for idx in range(len(models)):
         for layer in models[idx].sublayers(include_self=True):
             layer._casted_by_pure_fp16 = True
@@ -132,43 +129,7 @@ def pure_fp16_initialize(enable_pure_fp16, models, optimizers):
                         paddle.nn.BatchNorm, paddle.nn.LayerNorm)):
                     continue
                 layer.to(dtype='float16')
-
-    for idx_opt in range(len(optimizers)):
-        # update _param_groups
-        if getattr(optimizers[idx_opt], '_param_groups', None) and isinstance(
-                optimizers[idx_opt]._param_groups[0], dict):
-            for param_group in optimizers[idx_opt]._param_groups:
-                for i, param in enumerate(param_group['params']):
-                    for idx_model in range(len(models)):
-                        for layer in models[idx_model].sublayers(
-                                include_self=True):
-                            if id(param) in layer._parameters_transform_map:
-                                param_group['params'][
-                                    i] = layer._parameters_transform_map[id(
-                                        param)][0]
-            for param_group in optimizers[idx_opt]._parameter_list:
-                params = param_group['params']
-                for i, param in enumerate(params):
-                    for idx_model in range(len(models)):
-                        for layer in models[idx_model].sublayers(
-                                include_self=True):
-                            if id(param) in layer._parameters_transform_map:
-                                params[i] = layer._parameters_transform_map[id(
-                                    param)][0]
-        # update _parameter_list
-        else:
-            for i, param in enumerate(optimizers[idx_opt]._parameter_list):
-                for idx_model in range(len(models)):
-                    for layer in models[idx_model].sublayers(include_self=True):
-                        if id(param) in layer._parameters_transform_map:
-                            optimizers[idx_opt]._parameter_list[
-                                i] = layer._parameters_transform_map[id(param)][
-                                    0]
-                            if hasattr(optimizers[idx_opt], '_param_groups'):
-                                optimizers[idx_opt]._param_groups[
-                                    i] = layer._parameters_transform_map[id(
-                                        param)][0]
-    return models, optimizers
+    return models
 
 
 def check_models(models):
@@ -401,8 +362,7 @@ def amp_decorate(models,
             "optimizers must be either a single optimizer or a list of optimizers."
         )
 
-    models, optimizers = pure_fp16_initialize(
-        enable_pure_fp16=True, models=models, optimizers=optimizers)
+    models = pure_fp16_initialize(models=models)
 
     # supprot master_weight    
     for idx_opt in range(len(optimizers)):
diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/convert_operators.py b/python/paddle/fluid/dygraph/dygraph_to_static/convert_operators.py
index d27af5c0dd9e0..0ac4da947a46b 100644
--- a/python/paddle/fluid/dygraph/dygraph_to_static/convert_operators.py
+++ b/python/paddle/fluid/dygraph/dygraph_to_static/convert_operators.py
@@ -214,7 +214,7 @@ def convert_ifelse(pred, true_fn, false_fn, true_args, false_args, return_vars):
 
 
 def _remove_no_value_return_var(out):
-    if out and isinstance(out, tuple):
+    if isinstance(out, tuple) and len(out) > 0:
         processed_out = out
         align_ret = out[0]
         if isinstance(align_ret, tuple):
diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/error.py b/python/paddle/fluid/dygraph/dygraph_to_static/error.py
index 273961e27efba..69ec89a5af644 100644
--- a/python/paddle/fluid/dygraph/dygraph_to_static/error.py
+++ b/python/paddle/fluid/dygraph/dygraph_to_static/error.py
@@ -122,7 +122,7 @@ def formated_message(self):
         msg = ' ' * BLANK_COUNT_BEFORE_FILE_STR + 'File "{}", line {}, in {}\n'.format(
             self.location.filepath, self.location.lineno, self.function_name)
         # add empty line after range code
-        return msg + '\n'.join(self.source_code) + '\n'
+        return msg + '\n'.join(self.source_code)
 
 
 class SuggestionDict(object):
@@ -183,24 +183,39 @@ def create_message(self):
             return '\n'.join(message_lines)
 
         # Step2: Optimizes stack information with source code information of dygraph from user.
-        whether_source_range = True
-        for filepath, lineno, funcname, code in self.origin_traceback[::-1]:
-            loc = Location(filepath, lineno)
-            dygraph_func_info = self.origin_info_map.get(loc.line_location,
+        user_code_traceback_index = []
+        for i, (filepath, lineno, funcname,
+                code) in enumerate(self.origin_traceback):
+            dygraph_func_info = self.origin_info_map.get((filepath, lineno),
                                                          None)
             if dygraph_func_info:
-                if whether_source_range:
-                    traceback_frame = TraceBackFrameRange(
-                        dygraph_func_info.location,
-                        dygraph_func_info.function_name)
-                    whether_source_range = False
-                else:
-                    traceback_frame = TraceBackFrame(
-                        dygraph_func_info.location,
-                        dygraph_func_info.function_name,
-                        dygraph_func_info.source_code)
-                # Two elements already exist in message_lines: "In transformed code:" and "", so insert in index 2
-                message_lines.insert(2, traceback_frame.formated_message())
+                user_code_traceback_index.append(i)
+
+        # Add user code traceback
+        for i in user_code_traceback_index:
+            filepath, lineno, funcname, code = self.origin_traceback[i]
+            dygraph_func_info = self.origin_info_map.get((filepath, lineno),
+                                                         None)
+            if i == user_code_traceback_index[-1]:
+                traceback_frame = TraceBackFrameRange(
+                    dygraph_func_info.location, dygraph_func_info.function_name)
+            else:
+                traceback_frame = TraceBackFrame(
+                    dygraph_func_info.location, dygraph_func_info.function_name,
+                    dygraph_func_info.source_code)
+
+            message_lines.append(traceback_frame.formated_message())
+        message_lines.append("")
+
+        # Add paddle traceback after user code traceback
+        paddle_traceback_start_index = user_code_traceback_index[
+            -1] + 1 if user_code_traceback_index else 0
+        for filepath, lineno, funcname, code in self.origin_traceback[
+                paddle_traceback_start_index:]:
+            traceback_frame = TraceBackFrame(
+                Location(filepath, lineno), funcname, code)
+            message_lines.append(traceback_frame.formated_message())
+        message_lines.append("")
 
         # Step3: Adds error message like "TypeError: dtype must be int32, but received float32".
         # NOTE: `format_exception` is a list, its length is 1 in most cases, but sometimes its length
@@ -258,8 +273,9 @@ def _simplify_error_value(self):
         bottom_error_message = error_value_lines[empty_line_idx + 1:]
         revise_suggestion = self._create_revise_suggestion(bottom_error_message)
 
-        filepath = ''
-        error_from_user_code = []
+        user_filepath = ''
+        error_traceback = []
+        user_code_traceback_index = []
         pattern = 'File "(?P<filepath>.+)", line (?P<lineno>.+), in (?P<function_name>.+)'
         for i in range(0, len(error_value_lines_strip), 2):
             if error_value_lines_strip[i].startswith("File "):
@@ -268,22 +284,35 @@ def _simplify_error_value(self):
                 code = error_value_lines_strip[i + 1] if i + 1 < len(
                     error_value_lines_strip) else ''
                 if i == 0:
-                    filepath = tmp_filepath
-                if tmp_filepath == filepath:
-                    error_from_user_code.append(
-                        (tmp_filepath, int(lineno_str), function_name, code))
+                    user_filepath = tmp_filepath
+                if tmp_filepath == user_filepath:
+                    user_code_traceback_index.append(len(error_traceback))
+
+                error_traceback.append(
+                    (tmp_filepath, int(lineno_str), function_name, code))
 
         error_frame = []
-        whether_source_range = True
-        for filepath, lineno, funcname, code in error_from_user_code[::-1]:
-            loc = Location(filepath, lineno)
-            if whether_source_range:
-                traceback_frame = TraceBackFrameRange(loc, funcname)
-                whether_source_range = False
+        # Add user code traceback
+        for i in user_code_traceback_index:
+            filepath, lineno, funcname, code = error_traceback[i]
+            if i == user_code_traceback_index[-1]:
+                traceback_frame = TraceBackFrameRange(
+                    Location(filepath, lineno), funcname)
             else:
-                traceback_frame = TraceBackFrame(loc, funcname, code)
-
-            error_frame.insert(0, traceback_frame.formated_message())
+                traceback_frame = TraceBackFrame(
+                    Location(filepath, lineno), funcname, code)
+            error_frame.append(traceback_frame.formated_message())
+        error_frame.append("")
+
+        # Add paddle traceback after user code traceback
+        paddle_traceback_start_index = user_code_traceback_index[
+            -1] + 1 if user_code_traceback_index else 0
+        for filepath, lineno, funcname, code in error_traceback[
+                paddle_traceback_start_index:]:
+            traceback_frame = TraceBackFrame(
+                Location(filepath, lineno), funcname, code)
+            error_frame.append(traceback_frame.formated_message())
+        error_frame.append("")
 
         error_frame.extend(bottom_error_message)
         error_frame.extend(revise_suggestion)
diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/static_analysis.py b/python/paddle/fluid/dygraph/dygraph_to_static/static_analysis.py
index ce5f50137b7aa..45a42d481b5a9 100644
--- a/python/paddle/fluid/dygraph/dygraph_to_static/static_analysis.py
+++ b/python/paddle/fluid/dygraph/dygraph_to_static/static_analysis.py
@@ -15,7 +15,8 @@
 from __future__ import print_function
 
 from paddle.utils import gast
-from .utils import is_paddle_api, is_dygraph_api, is_numpy_api, index_in_list
+from .logging_utils import warn
+from .utils import is_paddle_api, is_dygraph_api, is_numpy_api, index_in_list, ast_to_source_code
 
 __all__ = ['AstNodeWrapper', 'NodeVarType', 'StaticAnalysisVisitor']
 
@@ -57,6 +58,15 @@ class NodeVarType(object):
     # If node.node_var_type in TENSOR_TYPES, it can be considered as tensor-dependent.
     TENSOR_TYPES = {TENSOR, PADDLE_RETURN_TYPES}
 
+    Annotation_map = {
+        "Tensor": TENSOR,
+        "paddle.Tensor": TENSOR,
+        "int": INT,
+        "float": FLOAT,
+        "bool": BOOLEAN,
+        "str": STRING
+    }
+
     @staticmethod
     def binary_op_output_type(in_type1, in_type2):
         if in_type1 == in_type2:
@@ -83,6 +93,16 @@ def binary_op_output_type(in_type1, in_type2):
             return NodeVarType.UNKNOWN
         return max(in_type1, in_type2)
 
+    @staticmethod
+    def type_from_annotation(annotation):
+        annotation_str = ast_to_source_code(annotation).strip()
+        if annotation_str in NodeVarType.Annotation_map:
+            return NodeVarType.Annotation_map[annotation_str]
+
+        # raise warning if not found
+        warn("Currently we don't support annotation: %s" % annotation_str)
+        return NodeVarType.UNKNOWN
+
 
 class AstNodeWrapper(object):
     """
@@ -316,6 +336,18 @@ def _get_node_var_type(self, cur_wrapper):
                     self.var_env.set_var_type(target.id, ret_type)
             return ret_type
 
+        if isinstance(node, gast.AnnAssign):
+            # TODO(0x45f): To determine whether need to support assignment statements
+            # like `self.x: float = 2.1`.
+            ret_type = {NodeVarType.type_from_annotation(node.annotation)}
+            # if annotation and value(Constant) are diffent type, we use value type
+            if node.value:
+                ret_type = self.node_to_wrapper_map[node.value].node_var_type
+            if isinstance(node.target, gast.Name):
+                self.node_to_wrapper_map[node.target].node_var_type = ret_type
+                self.var_env.set_var_type(node.target.id, ret_type)
+            return ret_type
+
         if isinstance(node, gast.Name):
             if node.id == "None":
                 return {NodeVarType.NONE}
@@ -325,21 +357,8 @@ def _get_node_var_type(self, cur_wrapper):
             parent_node_wrapper = cur_wrapper.parent
             if parent_node_wrapper and isinstance(parent_node_wrapper.node,
                                                   gast.arguments):
-                parent_node = parent_node_wrapper.node
-                var_type = {NodeVarType.UNKNOWN}
-                if parent_node.defaults:
-                    index = index_in_list(parent_node.args, node)
-                    args_len = len(parent_node.args)
-                    if index != -1 and args_len - index <= len(
-                            parent_node.defaults):
-                        defaults_node = parent_node.defaults[index - args_len]
-                        if isinstance(defaults_node, gast.Constant):
-                            var_type = self._get_constant_node_type(
-                                defaults_node)
-
-                            # Add node with identified type into cur_env.
-                            self.var_env.set_var_type(node.id, var_type)
-                return var_type
+
+                return self._get_func_argument_type(parent_node_wrapper, node)
 
             return self.var_env.get_var_type(node.id)
 
@@ -373,3 +392,42 @@ def _get_node_var_type(self, cur_wrapper):
                 return {NodeVarType.TENSOR}
 
         return {NodeVarType.STATEMENT}
+
+    def _get_func_argument_type(self, parent_node_wrapper, node):
+        """
+        Returns type information by parsing annotation or default values.
+        
+        For example:
+            1. parse by default values.
+                foo(x, y=1, z='s') -> x: UNKNOWN, y: INT, z: STR
+            
+            2. parse by Py3 type annotation.
+                foo(x: Tensor, y: int, z: str) -> x: Tensor, y: INT, z: STR
+
+            3. parse by type annotation and default values.
+                foo(x: Tensor, y: int, z: str = 'abc') -> x: Tensor, y: INT, z: STR
+
+        NOTE: Currently, we only support Tensor, int, bool, float, str et.al.
+              Other complicate types will be supported later.
+        """
+        assert isinstance(node, gast.Name)
+
+        parent_node = parent_node_wrapper.node
+        var_type = {NodeVarType.UNKNOWN}
+        if node.annotation is not None:
+            var_type = {NodeVarType.type_from_annotation(node.annotation)}
+            self.var_env.set_var_type(node.id, var_type)
+
+        # if annotation and value(Constant) are diffent type, we use value type
+        if parent_node.defaults:
+            index = index_in_list(parent_node.args, node)
+            args_len = len(parent_node.args)
+            if index != -1 and args_len - index <= len(parent_node.defaults):
+                defaults_node = parent_node.defaults[index - args_len]
+                if isinstance(defaults_node, gast.Constant):
+                    var_type = self._get_constant_node_type(defaults_node)
+
+                    # Add node with identified type into cur_env.
+                    self.var_env.set_var_type(node.id, var_type)
+
+        return var_type
diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/utils.py b/python/paddle/fluid/dygraph/dygraph_to_static/utils.py
index 650857eefb3bb..4da898d7441d8 100644
--- a/python/paddle/fluid/dygraph/dygraph_to_static/utils.py
+++ b/python/paddle/fluid/dygraph/dygraph_to_static/utils.py
@@ -520,7 +520,8 @@ def remove_if_exit(filepath):
 
 def _inject_import_statements():
     import_statements = [
-        "import paddle", "import paddle.fluid as fluid", "from typing import *",
+        "import paddle", "from paddle import Tensor",
+        "import paddle.fluid as fluid", "from typing import *",
         "import numpy as np"
     ]
     return '\n'.join(import_statements) + '\n'
diff --git a/python/paddle/fluid/dygraph/layers.py b/python/paddle/fluid/dygraph/layers.py
index e1855ee6db9af..8bf8300c8a263 100644
--- a/python/paddle/fluid/dygraph/layers.py
+++ b/python/paddle/fluid/dygraph/layers.py
@@ -92,7 +92,7 @@ class Layer(core.Layer):
                 If set str, it can be "bool",  "float16", "float32", "float64",
                 "int8", "int16", "int32", "int64", "uint8" or "uint16".
                 Default: "float32"
-    
+
     Returns:
         None
     """
@@ -121,9 +121,6 @@ def __init__(self, name_scope=None, dtype="float32"):
         self._forward_pre_hooks = collections.OrderedDict()
         self._forward_post_hooks = collections.OrderedDict()
 
-        self._parameters_transform_map = {}
-        self._buffers_transform_map = {}
-
         self._casted_by_pure_fp16 = False
 
         self._state_dict_hooks = collections.OrderedDict()
@@ -278,7 +275,7 @@ def register_forward_post_hook(self, hook):
 
         It should have the following form, `input` and `output` of the `hook` is `input` and `output` of the `Layer` respectively.
         User can use forward post-hook to change the output of the Layer or perform information statistics tasks on the Layer.
- 
+
         hook(Layer, input, output) -> None or modified output
 
         Parameters:
@@ -324,9 +321,9 @@ def forward_post_hook(layer, input, output):
 
     def register_forward_pre_hook(self, hook):
         """Register a forward pre-hook for Layer. The hook will be called before `forward` function has been computed.
-        
+
         It should have the following form, `input` of the `hook` is `input` of the `Layer`,
-        hook can either return a tuple or a single modified value in the hook. We will wrap the value into a tuple if 
+        hook can either return a tuple or a single modified value in the hook. We will wrap the value into a tuple if
         a single value is returned(unless that value is already a tuple).
         User can use forward pre-hook to change the input of the Layer or perform information statistics tasks on the Layer.
 
@@ -382,7 +379,7 @@ def create_parameter(self,
                          is_bias=False,
                          default_initializer=None):
         """Create parameters for this layer.
-        
+
         Parameters:
             shape(list): Shape of the parameter.
             attr(ParamAttr, optional): Parameter attribute of weight. Please refer to :ref:`api_paddle_ParamAttr`. Default: None.
@@ -453,13 +450,13 @@ def __init__(self,
                                 out_features):
                         super(MyLinear, self).__init__()
                         self.linear = paddle.nn.Linear( 10, 10)
-                            
+
                         self.back_var = self.create_variable(name = "linear_tmp_0", dtype=self._dtype)
-                    
+
                     def forward(self, input):
                         out = self.linear(input)
                         paddle.assign( out, self.back_var)
-                        
+
                         return out
 
         """
@@ -503,13 +500,13 @@ def __init__(self,
                                 out_features):
                         super(MyLinear, self).__init__()
                         self.linear = paddle.nn.Linear( 10, 10)
-                            
+
                         self.back_var = self.create_tensor(name = "linear_tmp_0", dtype=self._dtype)
-                    
+
                     def forward(self, input):
                         out = self.linear(input)
                         paddle.assign( out, self.back_var)
-                        
+
                         return out
 
         """
@@ -729,7 +726,7 @@ def register_buffer(self, name, tensor, persistable=True):
 
         Returns:
             None
-        
+
         Examples:
             .. code-block:: python
 
@@ -856,10 +853,10 @@ def named_buffers(self, prefix='', include_sublayers=True):
     def clear_gradients(self):
         """
         Clear the gradients of all parameters for this layer.
-        
+
         Returns:
             None
-        
+
         Examples:
             .. code-block:: python
 
@@ -901,8 +898,8 @@ def __call__(self, *inputs, **kwargs):
                 with program_desc_tracing_guard(False):
                     self._build_once(*inputs, **kwargs)
 
-                    # TODO(liuyuhui) Only xpu broadcast parameters here. 
-                    # The other device is to call _sync_params_buffers in DataParallel 
+                    # TODO(liuyuhui) Only xpu broadcast parameters here.
+                    # The other device is to call _sync_params_buffers in DataParallel
                     # to realize the parameter synchronization among multiply cards.
                     if parallel_helper._is_data_parallel_mode(
                     ) and paddle.is_compiled_with_xpu():
@@ -944,7 +941,7 @@ def add_sublayer(self, name, sublayer):
             sublayer(Layer): an instance of Layer.
         Returns:
             Layer: the sublayer passed in.
-        
+
         Examples:
             .. code-block:: python
 
@@ -1167,7 +1164,7 @@ def _remove_if_exist(*dicts):
                         self._non_persistable_buffer_names_set.add(name)
                     _buffers[name] = value
                 elif _buffers is not None and name in _buffers:
-                    # Note(Aurelius84): In Dy2stat, the value of the Buffer may be modified in 
+                    # Note(Aurelius84): In Dy2stat, the value of the Buffer may be modified in
                     # decorated function, such as `self.buffer = new_tensor`. So we update its
                     # value via `assign`.
                     if type(value) == framework.Variable:
@@ -1326,7 +1323,7 @@ def to_static_state_dict(self,
         Parameters:
             destination(dict, optional) : If provide, all the parameters and persistable buffers will be set to this dict . Default: None
             include_sublayers(bool, optional) : If true, also include the parameters and persistable buffers from sublayers. Default: True
-            
+
         Retruns:
             dict: a dict contains all the parameters and persistable buffers.
 
@@ -1357,7 +1354,7 @@ def state_dict(self,
         Parameters:
             destination(dict, optional) : If provide, all the parameters and persistable buffers will be set to this dict . Default: None
             include_sublayers(bool, optional) : If true, also include the parameters and persistable buffers from sublayers. Default: True
-            
+
         Retruns:
             dict: a dict contains all the parameters and persistable buffers.
 
@@ -1385,7 +1382,7 @@ def set_state_dict(self, state_dict, use_structured_name=True):
 
         Parameters:
             state_dict(dict) : Dict contains all the parameters and persistable buffers.
-            use_structured_name(bool, optional) : If true, use structured name as key, otherwise, use parameter or buffer name as key. 
+            use_structured_name(bool, optional) : If true, use structured name as key, otherwise, use parameter or buffer name as key.
                                                   Default: True
         Returns:
             None
@@ -1473,50 +1470,36 @@ def _apply(self, func, device, dtype, blocking):
             if param is not None:
                 with no_grad():
                     param_applied = func(param, device, dtype, blocking)
-                    assert param.is_leaf
-                    param_applied.stop_gradient = param.stop_gradient
-                    if hasattr(param_applied, 'is_distributed'):
-                        param_applied.is_distributed = param.is_distributed
-                    self._parameters[key] = param_applied
 
                 if param.grad is not None:
                     with no_grad():
                         grad_applied = func(param._grad_ivar(), device, dtype,
                                             blocking)
 
-                        grad_applied.stop_gradient = param._grad_ivar(
-                        ).stop_gradient
-                        if hasattr(param._grad_ivar(), 'is_distributed'):
-                            grad_applied.is_distributed = param._grad_ivar(
-                            ).is_distributed
-                        self._parameters[key]._set_grad_ivar(grad_applied)
-
-            self._parameters_transform_map[id(param)] = [param_applied, key]
-
         for key, buf in self._buffers.items():
             self._buffers[key] = func(buf, device, dtype, blocking)
-            self._buffers_transform_map[id(buf)] = [self._buffers[key], key]
 
     def to(self, device=None, dtype=None, blocking=None):
         '''
         Cast the parameters and buffers of Layer by the give device, dtype and blocking.
 
         Parameters:
-            device(str|paddle.CPUPlace()|paddle.CUDAPlace()|paddle.CUDAPinnedPlace()|paddle.XPUPlace()|None, optional): The device of the Layer which want to be stored. 
-            If None, the device is the same with the original Tensor. If device is string, it can be ``cpu``, ``gpu:x`` and ``xpu:x``, where ``x`` is the 
-            index of the GPUs or XPUs. Default: None. 
-            
+            device(str|paddle.CPUPlace()|paddle.CUDAPlace()|paddle.CUDAPinnedPlace()|paddle.XPUPlace()|None, optional): The device of the Layer which want to be stored.
+            If None, the device is the same with the original Tensor. If device is string, it can be ``cpu``, ``gpu:x`` and ``xpu:x``, where ``x`` is the
+            index of the GPUs or XPUs. Default: None.
+
             dtype(str|core.VarDesc.VarType|None, optional): The type of the data. If None, the dtype is the same with the original Tensor. Default: None.
 
-            blocking(bool|None, optional): If False and the source is in pinned memory, the copy will be 
+            blocking(bool|None, optional): If False and the source is in pinned memory, the copy will be
               asynchronous with respect to the host. Otherwise, the argument has no effect. If None, the blocking is set True. Default: None.
-            
+
         Returns:
-            None
+            self
 
         Examples:
             .. code-block:: python
 
+                # required: gpu
                 import paddle
 
                 linear=paddle.nn.Linear(2, 2)
@@ -1542,12 +1525,12 @@ def to(self, device=None, dtype=None, blocking=None):
                 #Tensor(shape=[2, 2], dtype=float64, place=CUDAPinnedPlace, stop_gradient=False,
                 #       [[-0.04989364, -0.56889004],
                 #        [ 0.33960250,  0.96878713]])
-    
+
 
         '''
 
         if device is None and dtype is None and blocking is None:
-            return
+            return self
 
         if device is not None:
             if isinstance(device, str):
@@ -1573,28 +1556,66 @@ def transform(t, device, dtype, blocking):
             if dtype is None:
                 dtype = t.dtype
 
-            new_t = t._copy_to(device, blocking)
-            if isinstance(t, framework.ParamBase):
-                if dtype is not None and dtype != t.dtype:
+            # 1. gpu place need to determine whether the memory is sufficient for allocation:
+            if t.place.is_gpu_place():
+                gpu_memory_available = core.gpu_memory_available()
+                # for gpu, minimum memory allocation unit is 256 bytes.
+                if type(dtype) is str:
+                    size_dtype = core.size_of_dtype(
+                        convert_np_dtype_to_dtype_(dtype))
+                else:
+                    size_dtype = core.size_of_dtype(dtype)
+                # Note(zhangbo): Paddle GPU minimum memory allocation unit is 256 bytes, waiting_alloc_memory will comput ‘t’ occupied memory space.
+                # Coefficient 1.2 is used to avoid OOM that may occur in this critical state when the memory is just enough.
+                waiting_alloc_memory = (
+                    (t.numel().numpy()[0] * size_dtype) / 256 + 1) * 256 * 1.2
+                if gpu_memory_available < waiting_alloc_memory:
+                    # Copy param / Tensor to cpu
+                    t_used = t._copy_to(paddle.CPUPlace(),
+                                        blocking)  # k-v type will error
+                    # Release mem of t
+                    t.value().get_tensor()._clear()
+                else:
+                    t_used = t
+            else:
+                t_used = t
+
+            # 2. cast param / Tensor to dtype
+            if dtype is not None and dtype != t_used.dtype:
+                if isinstance(t_used, framework.ParamBase):
+                    from paddle.fluid.layer_helper import LayerHelper
+                    helper = LayerHelper("cast", **locals())
+                    t_casted = helper.create_variable_for_type_inference(
+                        dtype=dtype)
                     framework._dygraph_tracer().trace_op(
                         type='cast',
-                        inputs={'X': new_t},
-                        outputs={'Out': new_t},
+                        inputs={'X': t_used},
+                        outputs={'Out': t_casted},
                         attrs={
-                            'in_dtype': t.dtype,
+                            'in_dtype': t_used.dtype,
                             'out_dtype': convert_np_dtype_to_dtype_(dtype)
                         })
+                else:
+                    t_casted = t_used.cast(dtype=dtype)
             else:
-                if dtype is not None and dtype != t.dtype:
-                    new_t = new_t.cast(dtype=dtype)
+                t_casted = t_used
 
-            return new_t
+            # 3. Copy casted cpu param / Tensor to device
+            new_t = t_casted._copy_to(device, blocking)
+
+            # 4. share Tensor to origin param / Tensor
+            dst_tensor = t.value().get_tensor()
+            src_tensor = new_t.value().get_tensor()
+            dst_tensor._share_data_with(src_tensor)
+
+            return t
 
         with warnings.catch_warnings():
             warnings.filterwarnings("ignore", category=UserWarning)
             self._apply(transform, device, dtype, blocking)
 
         self._dtype = dtype
+        return self
 
     # [aliases] Compatible with old method names
     set_dict = set_state_dict
diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py
index 6fba200f54099..dd5744203d547 100644
--- a/python/paddle/fluid/executor.py
+++ b/python/paddle/fluid/executor.py
@@ -485,10 +485,11 @@ def handler(self, res_dict):
 
 
 class _StandaloneExecutor(object):
-    def __init__(self, place, main_program):
+    def __init__(self, place, main_program, scope):
         self._place = core.Place()
         self._place.set_place(place)
         self._main_program = main_program
+        self._scope = scope
         self._new_exe = self._create_new_executor()
 
     def run(self, feed, fetch_list, return_numpy=True):
@@ -522,9 +523,8 @@ def run(self, feed, fetch_list, return_numpy=True):
     def _create_new_executor(self):
         # NOTE: It's a trick to set empty start_up program.
         startup_program = Program()
-        outer_scope = global_scope()
         new_exe = core.StandaloneExecutor(self._place, startup_program.desc,
-                                          self._main_program.desc, outer_scope)
+                                          self._main_program.desc, self._scope)
 
         return new_exe
 
@@ -585,11 +585,11 @@ def __init__(self, place):
         self._place = place
         self._cached_executors = {}
 
-    def run(self, program, feed, fetch_list, return_numpy=True):
-        new_exe = self._get_exe_from_cache(program)
+    def run(self, program, scope, feed, fetch_list, return_numpy=True):
+        new_exe = self._get_exe_from_cache(program, scope)
         return new_exe.run(feed, fetch_list, return_numpy)
 
-    def _get_exe_from_cache(self, program):
+    def _get_exe_from_cache(self, program, scope):
         """
         Return cached _StandaloneExecutor instance. If not found, create associated 
         _StandaloneExecutor instance with given program and cache it.
@@ -598,7 +598,7 @@ def _get_exe_from_cache(self, program):
             program, Program), "Required type(Program), but received {}".format(
                 type(program).__name__)
         if program not in self._cached_executors:
-            new_exe = _StandaloneExecutor(self._place, program)
+            new_exe = _StandaloneExecutor(self._place, program, scope)
             self._cached_executors[program] = new_exe
 
         return self._cached_executors[program]
@@ -1271,6 +1271,11 @@ def _run_impl(self, program, feed, fetch_list, feed_var_name,
         fetch_list = self._check_fetch_list(fetch_list)
 
         if isinstance(program, Program) and program._pipeline_opt:
+            if "fleet_opt" in program._pipeline_opt:
+                return self._run_using_fleet_executor(
+                    program,
+                    fetch_list=fetch_list,
+                    use_program_cache=use_program_cache)
             if "startup_program" in program._pipeline_opt:
                 program = program._pipeline_opt["startup_program"]
             else:
@@ -1296,9 +1301,13 @@ def _run_impl(self, program, feed, fetch_list, feed_var_name,
 
         # NOTE: This is an experimental feature. If `export FLAGS_USE_STANDALONE_EXECUTOR=1 `,
         # use StandaloneExecutor to run the program.
-        if self._enable_interpreter_core and not program._is_start_up_program_:
-            return self._executor_cache.run(program, feed, fetch_list,
-                                            return_numpy)
+        if self._enable_interpreter_core:
+            inner_program_ = program._program if isinstance(
+                program, compiler.CompiledProgram) else program
+            assert isinstance(inner_program_, framework.Program)
+            if not inner_program_._is_start_up_program_:
+                return self._executor_cache.run(inner_program_, scope, feed,
+                                                fetch_list, return_numpy)
 
         # use_prune can be overrided by putting optimize_ops in fetch_list
         _origin_fetch_list = fetch_list
@@ -1820,6 +1829,31 @@ def _get_real_program_fetch_list():
 
         return ctx
 
+    def _run_using_fleet_executor(self,
+                                  program=None,
+                                  dataset=None,
+                                  scope=None,
+                                  thread=0,
+                                  is_infer=False,
+                                  debug=False,
+                                  fetch_list=None,
+                                  fetch_info=None,
+                                  print_period=100,
+                                  fetch_handler=None,
+                                  use_program_cache=False):
+        scope, real_fetch_list, trainer_instance = \
+            self._prepare_pipeline_ctx(program, dataset, scope, thread,
+                                       is_infer, debug, fetch_list, fetch_info,
+                                       print_period, fetch_handler,
+                                       use_program_cache)
+        from ..distributed.fleet.proto import fleet_executor_desc_pb2
+        from google.protobuf import text_format
+        fleet_exe_desc = fleet_executor_desc_pb2.FleetExecutorDesc()
+        fleet_exe = core.FleetExecutor(fleet_exe_desc.SerializeToString())
+        fleet_exe.init(program._pipeline_opt["section_program"].desc)
+        fleet_exe.run()
+        return None
+
     def _run_pipeline(self,
                       program=None,
                       dataset=None,
diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py
index a3cd34c32ebbf..6b868903c8cec 100644
--- a/python/paddle/fluid/framework.py
+++ b/python/paddle/fluid/framework.py
@@ -467,7 +467,7 @@ def is_compiled_with_cuda():
         .. code-block:: python
 
             import paddle
-            support_gpu = paddle.is_compiled_with_cuda()
+            support_gpu = paddle.device.is_compiled_with_cuda()
     """
     return core.is_compiled_with_cuda()
 
@@ -482,7 +482,7 @@ def is_compiled_with_rocm():
         .. code-block:: python
 
             import paddle
-            support_gpu = paddle.is_compiled_with_rocm()
+            support_gpu = paddle.device.is_compiled_with_rocm()
     """
     return core.is_compiled_with_rocm()
 
@@ -1308,13 +1308,12 @@ def _to_readable_code(self):
         if self.persistable:
             var_str = "persist " + var_str
 
-        from paddle.distributed.auto_parallel.context import get_default_distributed_context
+        from paddle.distributed.auto_parallel.dist_context import get_default_distributed_context
         dist_context = get_default_distributed_context()
-        var_dist_attr = dist_context.get_tensor_distributed_attr_for_program(
-            self)
-        if var_dist_attr is not None:
+        dist_tensor = dist_context.get_dist_tensor_for_program(self)
+        if dist_tensor is not None:
             var_str += ", {name} = {value}".format(
-                name="dist_attr", value=var_dist_attr)
+                name="dist_attr", value=dist_tensor)
 
         return var_str
 
@@ -2529,12 +2528,12 @@ def _to_readable_code(self, skip_op_callstack=True):
             if i != len(attr_names) - 1:
                 attrs_str += ", "
 
-        from paddle.distributed.auto_parallel.context import get_default_distributed_context
+        from paddle.distributed.auto_parallel.dist_context import get_default_distributed_context
         dist_context = get_default_distributed_context()
-        op_dist_attr = dist_context.get_op_distributed_attr_for_program(self)
-        if op_dist_attr is not None:
+        dist_op = dist_context.get_dist_op_for_program(self)
+        if dist_op is not None:
             attrs_str += ", {name} = {value}".format(
-                name="dist_attr", value=op_dist_attr)
+                name="dist_attr", value=dist_op)
 
         if outputs_str != "{}":
             op_str = "{outputs} = {op_type}(inputs={inputs}, {attrs})".\
diff --git a/python/paddle/fluid/incubate/fleet/parameter_server/pslib/__init__.py b/python/paddle/fluid/incubate/fleet/parameter_server/pslib/__init__.py
index 309532cafc2e1..8d803c0d5bd7d 100644
--- a/python/paddle/fluid/incubate/fleet/parameter_server/pslib/__init__.py
+++ b/python/paddle/fluid/incubate/fleet/parameter_server/pslib/__init__.py
@@ -544,18 +544,6 @@ def clear_model(self):
             self._fleet_ptr.clear_model()
         self._role_maker._barrier_worker()
 
-    def clear_model(self):
-        """
-        clear_model() will be called by user. It will clear sparse model.
-        Examples:
-            .. code-block:: python
-              fleet.clear_model()
-        """
-        self._role_maker._barrier_worker()
-        if self._role_maker.is_first_worker():
-            self._fleet_ptr.clear_model()
-        self._role_maker._barrier_worker()
-
     def load_pslib_whitelist(self, table_id, model_path, **kwargs):
         """
         load pslib model for one table with whitelist
diff --git a/python/paddle/fluid/incubate/fleet/parameter_server/pslib/optimizer_factory.py b/python/paddle/fluid/incubate/fleet/parameter_server/pslib/optimizer_factory.py
index 56d476210894e..61630d7769206 100644
--- a/python/paddle/fluid/incubate/fleet/parameter_server/pslib/optimizer_factory.py
+++ b/python/paddle/fluid/incubate/fleet/parameter_server/pslib/optimizer_factory.py
@@ -42,9 +42,12 @@
     "scale_sparse_grad": None,
 }
 
-logging.basicConfig(
-    format='%(asctime)s - %(levelname)s - %(message)s', level=logging.INFO)
 logger = logging.getLogger(__name__)
+logger.setLevel(logging.INFO)
+formatter = logging.Formatter(fmt='%(asctime)s - %(levelname)s - %(message)s')
+ch = logging.StreamHandler()
+ch.setFormatter(formatter)
+logger.addHandler(ch)
 
 
 class DistributedOptimizerImplBase(object):
diff --git a/python/paddle/fluid/ir.py b/python/paddle/fluid/ir.py
index 3c7c8879fd420..adeab721fc2dd 100644
--- a/python/paddle/fluid/ir.py
+++ b/python/paddle/fluid/ir.py
@@ -19,6 +19,7 @@
 from . import core, unique_name
 from .framework import _apply_pass, OpProtoHolder
 
+from .proto import framework_pb2
 try:
     from .proto import pass_desc_pb2
 except ModuleNotFoundError:
@@ -142,28 +143,21 @@ def _get_args_from_func(self, func):
             input_spec = self._input_specs.get(arg_name)
             if isinstance(input_spec, paddle.static.InputSpec):
                 args.append(
-                    paddle.static.data(arg_name, input_spec.shape,
+                    PassDesc.VarHelper(arg_name, input_spec.shape,
                                        input_spec.dtype))
             elif isinstance(input_spec, paddle.ParamAttr):
                 args.append(paddle.ParamAttr(arg_name))
             else:
-                args.append(paddle.static.data(arg_name, [-1]))
+                args.append(PassDesc.VarHelper(arg_name, [-1]))
         return args
 
-    def _prune_program_desc(self, program_desc):
-        block_desc = program_desc.blocks[0]
-        # block_desc.ClearField("vars")
-        for var in [
-                var for var in block_desc.vars
-                if var.name not in self._input_specs
-        ]:
-            block_desc.vars.remove(var)
-        for op_desc in block_desc.ops:
+    def _prune_program_desc(self, ops):
+        for op_desc in ops:
             default_attrs = core.get_op_attrs_default_value(
                 paddle.compat.to_bytes(op_desc.type))
             remove_attrs = list()
             for attr in op_desc.attrs:
-                # attr must not in 
+                # attr must not in
                 if attr.name not in [
                         "op_namescope", "op_callstack", "op_device"
                 ]:
@@ -179,33 +173,69 @@ def _prune_program_desc(self, program_desc):
             for attr in remove_attrs:
                 op_desc.attrs.remove(attr)
 
-    def _func_to_program_desc(self, func, program_desc, is_replace=False):
+    def _func_to_program_desc(self, func, ops):
         vars = list()
         program = paddle.static.Program()
         startup_program = paddle.static.Program()
         with paddle.static.program_guard(program, startup_program):
             args = self._get_args_from_func(func)
-            for arg in args:
-                vars.append(arg.name)
+            vars.extend(args)
             outs = func(*args)
             if not isinstance(outs, (list, tuple)):
                 outs = [outs]
             for out in outs:
                 if isinstance(out, PassDesc.OpHelper):
-                    for out in out.Outputs().values():
-                        vars.extend(out)
-                elif isinstance(out, paddle.fluid.framework.Variable):
-                    vars.append(out.name)
-        program_desc.ParseFromString(program.desc.serialize_to_string())
-        self._prune_program_desc(program_desc)
-        if is_replace:
-            attrs = list()
-            for op in program.current_block().ops:
-                if not isinstance(op, PassDesc.OpHelper):
-                    continue
-                attrs.extend(op._attrs.values())
-            return vars, attrs
-        return vars
+                    op_outs = out.Outputs()
+                    if len(op_outs) != 1:
+                        raise ValueError(
+                            "Operator '{}' has multiple outputs, please specify one output variable.".
+                            format(out._type))
+                    for op_out in op_outs.values():
+                        vars.extend(op_out)
+                else:
+                    vars.append(out)
+        block_desc = program.current_block().desc
+        for i in range(block_desc.op_size()):
+            ops.add().ParseFromString(block_desc.op(i).serialize_to_string())
+        self._prune_program_desc(ops)
+        return vars, program.current_block().ops
+
+    def _convert_vars_to_pass_desc(self, patterns, replaces, desc):
+        for (pattern, replace) in zip(patterns, replaces):
+            # Convert maps of inputs and outputs.
+            var_map = desc.var_maps.add()
+            var_map.pattern_var = pattern.name
+            var_map.replace_var = replace.name
+            conditions = desc.var_attr_conditions
+            # Convert shape condition.
+            if pattern.name in self._input_specs:
+                condition = conditions.add()
+                pattern.Attr("shape")._to_pass_desc_attr(condition.attr)
+                condition.condition_value.name = ""
+                condition.condition_value.type = framework_pb2.AttrType.LONGS
+                condition.condition_value.longs.extend(pattern.shape)
+                condition.type = pass_desc_pb2.PassDesc.ConditionType.kEQ
+            # Convert attr conditions.
+            if PassDesc.VarHelper == pattern.__class__:
+                for attr in pattern._attrs.values():
+                    if attr._condition is not None:
+                        conditions.append(attr._condition)
+                    conditions.extend(
+                        [e._condition for e in attr._elements if e._condition])
+
+    def _convert_ops_to_pass_desc(self, patterns, replaces, desc):
+        for replace in replaces:
+            if isinstance(replace, PassDesc.OpHelper):
+                for attr in replace._attrs.values():
+                    # Convert attr maps.
+                    mapped = attr._mapped
+                    if inspect.isfunction(mapped):
+                        mapped = mapped(patterns)
+                    attr_map = desc.op_attr_maps.add()
+                    mapped._to_pass_desc_attr(attr_map.pattern_attr)
+                    attr._to_pass_desc_attr(attr_map.replace_attr)
+                    if mapped._operation is not None:
+                        attr_map.operation.CopyFrom(mapped._operation)
 
     def SerializeMultiPassDesc(self):
         switch_static_mode = paddle.in_dynamic_mode()
@@ -213,30 +243,18 @@ def SerializeMultiPassDesc(self):
             paddle.enable_static()
         multi_pass_desc = pass_desc_pb2.MultiPassDesc()
         multi_pass_desc.pass_type = self._pass_type
+        # Traverse all pass pairs and convert them to PassDesc data.
+        # Here need to add cache in the future. 
         for (pattern, replace) in self._pass_pairs:
             pass_desc = multi_pass_desc.pass_descs.add()
-            pattern_vars = self._func_to_program_desc(pattern,
-                                                      pass_desc.pattern)
-            replace_vars, attrs = self._func_to_program_desc(
-                replace, pass_desc.replace, is_replace=True)
-            for (pattern_var, replace_var) in zip(pattern_vars, replace_vars):
-                var_map = pass_desc.var_maps.add()
-                var_map.pattern_var = pattern_var
-                var_map.replace_var = replace_var
-            pattern_op_idxs = dict()
-            for (idx, op) in enumerate(pass_desc.pattern.blocks[0].ops):
-                op_idxs = pattern_op_idxs.get(op.type)
-                if op_idxs:
-                    op_idxs.append(idx)
-                else:
-                    pattern_op_idxs[op.type] = [idx]
-            for attr in attrs:
-                attr_map = pass_desc.attr_maps.add()
-                attr_map.pattern_op_idx = pattern_op_idxs[
-                    attr._pattern_op_type][attr._pattern_op_idx]
-                attr_map.replace_op_idx = attr._replace_op_idx
-                attr_map.pattern_name = attr._pattern_name
-                attr_map.replace_name = attr._replace_name
+            # Convert ProgramDescs of pattern and replace subgraphs.
+            pattern_vars, pattern_ops = self._func_to_program_desc(
+                pattern, pass_desc.pattern)
+            replace_vars, replace_ops = self._func_to_program_desc(
+                replace, pass_desc.replace)
+            self._convert_vars_to_pass_desc(pattern_vars, replace_vars,
+                                            pass_desc)
+            self._convert_ops_to_pass_desc(pattern_ops, replace_ops, pass_desc)
         if switch_static_mode:
             paddle.disable_static()
         return multi_pass_desc.SerializeToString()
@@ -244,18 +262,119 @@ def SerializeMultiPassDesc(self):
 
 class PassDesc(object):
     class AttrHelper(object):
-        def __init__(self, name, replace_op_idx):
-            self._pattern_op_type = None
-            self._pattern_op_idx = -1
-            self._replace_op_idx = replace_op_idx
-            self._pattern_name = name
-            self._replace_name = name
-
-        def ReusePattern(self, op, index=0, name=None):
-            if name:
-                self._pattern_name = name
-            self._pattern_op_type = op
-            self._pattern_op_idx = index
+        def __init__(self, obj, name, element_index=None):
+            self._obj = obj
+            self._name = name
+            self._operation_type = None
+            self._element_index = element_index
+            self._elements = list()
+            self._operation = None
+            self._condition = None
+            self._mapped = None
+
+        def __getitem__(self, index):
+            element = PassDesc.AttrHelper(
+                self._obj, self._name, element_index=index)
+            self._elements.append(element)
+            return element
+
+        def _to_pass_desc_attr(self, pass_desc_attr):
+            if isinstance(self._obj, PassDesc.VarHelper):
+                pass_desc_attr.role = pass_desc_pb2.PassDesc.RoleType.kVariable
+                pass_desc_attr.var_name = self._obj.name
+            else:
+                pass_desc_attr.role = pass_desc_pb2.PassDesc.RoleType.kOperator
+                pass_desc_attr.op_index = self._obj._index
+            pass_desc_attr.name = self._name
+            if self._operation_type is not None:
+                pass_desc_attr.operation = self._operation_type
+            if self._element_index is not None:
+                pass_desc_attr.element_index = self._element_index
+
+        def _to_op_desc_attr(self, value, op_desc_attr):
+            op_desc_attr.name = ""
+            if isinstance(value, int):
+                op_desc_attr.type = framework_pb2.AttrType.INT
+                op_desc_attr.i = value
+            else:
+                raise NotImplementedError("Unimplemented transform operation.")
+
+        def _clone_with_operation(self, type, value=None):
+            attr = PassDesc.AttrHelper(self._obj, self._name,
+                                       self._element_index)
+            self._elements.append(attr)
+            if value is None:
+                attr._operation_type = type
+                return attr
+            operation = pass_desc_pb2.PassDesc.Operation()
+            operation.type = type
+            if isinstance(value, PassDesc.AttrHelper):
+                value._to_pass_desc_attr(operation.attr)
+            else:
+                self._to_op_desc_attr(value, operation.value)
+            attr._operation = operation
+            attr._operation_type = self._operation_type
+            return attr
+
+        def __sub__(self, value):
+            return self._clone_with_operation(
+                pass_desc_pb2.PassDesc.OperationType.kSub, value)
+
+        def __add__(self, value):
+            return self._clone_with_operation(
+                pass_desc_pb2.PassDesc.OperationType.kAdd, value)
+
+        def Size(self):
+            return self._clone_with_operation(
+                pass_desc_pb2.PassDesc.OperationType.kSize)
+
+        def _set_with_condition(self, type, value):
+            condition = pass_desc_pb2.PassDesc.AttrCondition()
+            self._to_pass_desc_attr(condition.attr)
+            condition.type = type
+            if isinstance(value, PassDesc.AttrHelper):
+                value._to_pass_desc_attr(condition.condition_attr)
+            else:
+                self._to_op_desc_attr(value, condition.condition_value)
+            self._condition = condition
+
+        def EQ(self, value):
+            self._set_with_condition(pass_desc_pb2.PassDesc.ConditionType.kEQ,
+                                     value)
+
+        def MappedPattern(self, var=None, op=None, index=0, name=None):
+            if all([var, op]):
+                raise ValueError("Only mapped one of which var or op.")
+
+            def mapped_var(pattern_ops):
+                raise NotImplementedError(
+                    "Mapping to variable is not implemented.")
+
+            def mapped_op(pattern_ops):
+                ops = [o for o in pattern_ops if o._type == op]
+                if len(ops) <= index:
+                    raise ValueError(
+                        "Index '{}' of operator '{}' is incorrect.".format(
+                            index, op))
+                return PassDesc.AttrHelper(ops[index], name)
+
+            self._mapped = mapped_op if var is None else mapped_var
+
+    class VarHelper(paddle.static.Variable):
+        def __init__(self, *args, **kwargs):
+            block = paddle.static.default_main_program().current_block()
+            self._var = paddle.static.data(*args, **kwargs)
+            self._attrs = dict()
+
+        def __getattr__(self, name):
+            return getattr(self._var, name)
+
+        def Attr(self, name):
+            attr = self._attrs.get(name)
+            if attr is None:
+                attr = PassDesc.AttrHelper(self, name)
+                self._attrs[name] = attr
+            return attr
 
     class OpHelper(object):
         def __init__(self, type=None):
@@ -267,8 +386,15 @@ def __getattr__(self, name):
             return op
 
         def __call__(self, *args, **kwargs):
+            if len(args) > 0:
+                raise ValueError(
+                    "Each input argument needs to specify a parameter name.")
             for (in_name, in_args) in kwargs.items():
-                in_arg_names = list()
+                op_input = self._inputs.get(in_name)
+                if op_input is None:
+                    raise ValueError(
+                        "Operator '{}' does not have input named '{}'.".format(
+                            self._type, in_name))
                 if isinstance(in_args, (list, tuple)):
                     if len(in_args) == 0:
                         raise ValueError(
@@ -278,52 +404,61 @@ def __call__(self, *args, **kwargs):
                     in_args = [in_args]
                 for in_arg in in_args:
                     if isinstance(in_arg, PassDesc.OpHelper):
-                        in_arg_names.extend(in_arg.Output())
+                        op_outs = in_arg.Outputs()
+                        if len(op_outs) != 1:
+                            raise ValueError(
+                                "The size of outputs of operator '{}' is not equal 1, please specify one output variable.".
+                                format(in_arg._type))
+                        for op_out in op_outs.values():
+                            op_input.extend(op_out)
                     else:
-                        in_arg_names.append(in_arg.name)
-                self._op_desc.set_input(in_name, in_arg_names)
+                        op_input.append(in_arg)
+                self._desc.set_input(in_name, [i.name for i in op_input])
+            block = paddle.static.default_main_program().current_block()
+            for out_name, op_output in self._outputs.items():
+                op_output_name = unique_name.generate(self._type)
+                op_output.append(block.create_var(name=op_output_name))
+                self._desc.set_output(out_name, [op_output_name])
             return self
 
         def Init(self):
             block = paddle.static.default_main_program().current_block()
-            self._attrs = dict()
-            self._op_idx = len(block.ops)
-            self._op_desc = block.desc.append_op()
-            self._op_desc.set_type(self._type)
-            self._op_proto = OpProtoHolder.instance().op_proto_map.get(
-                self._type)
-            if self._op_proto is None:
+            self._proto = OpProtoHolder.instance().op_proto_map.get(self._type)
+            if self._proto is None:
                 raise AttributeError(
                     "type object 'OpHelper' has no attribute '{}'".format(
                         self._type))
+            self._index = len(block.ops)
+            self._desc = block.desc.append_op()
+            self._desc.set_type(self._type)
+            self._attrs = dict()
+            self._inputs = {i.name: list() for i in self._proto.inputs}
+            self._outputs = {o.name: list() for o in self._proto.outputs}
             block.ops.append(self)
 
         def Attr(self, name):
             attr = self._attrs.get(name)
-            if attr:
-                return attr
-            attr = PassDesc.AttrHelper(name, self._op_idx)
-            self._attrs[name] = attr
+            if attr is None:
+                attr = PassDesc.AttrHelper(self, name)
+                self._attrs[name] = attr
             return attr
 
         def SetAttr(self, name, value):
-            self._op_desc._set_attr(name, value)
+            if isinstance(value, PassDesc.AttrHelper):
+                self.Attr(name)._mapped = value
+            else:
+                self._desc._set_attr(name, value)
 
-        def Output(self, name=None):
-            if name:
-                return self.Outputs()[name]
-            return list(self.Outputs().values())[0]
+        def Output(self, name):
+            output = self._outputs.get(name)
+            if output is None:
+                raise ValueError(
+                    "Operator '{}' does not have output named '{}'.".format(
+                        self._type, name))
+            return output
 
         def Outputs(self):
-            outputs = self._op_desc.outputs()
-            if len(outputs) > 0:
-                return outputs
-            block = paddle.static.default_main_program().current_block()
-            for output_proto in self._op_proto.outputs:
-                name = unique_name.generate(self._type)
-                block.create_var(name=name)
-                self._op_desc.set_output(output_proto.name, [name])
-            return self._op_desc.outputs()
+            return self._outputs
 
     OP = OpHelper()
 
diff --git a/python/paddle/fluid/layers/loss.py b/python/paddle/fluid/layers/loss.py
index eaac99fc5b592..3db4a894d1a07 100644
--- a/python/paddle/fluid/layers/loss.py
+++ b/python/paddle/fluid/layers/loss.py
@@ -479,9 +479,7 @@ def warpctc(input,
             blank=0,
             norm_by_times=False,
             input_length=None,
-            label_length=None,
-            norm_by_batchsize=False,
-            norm_by_total_logits_len=False):
+            label_length=None):
     """
     An operator integrating the open source Warp-CTC library
     (https://github.com/baidu-research/warp-ctc)
@@ -518,12 +516,6 @@ def warpctc(input,
          of Tensor type, it should have shape `[batch_size]` and dtype int64.
        label_length(Variable): The length for each label sequence if it is
          of Tensor type, it should have shape `[batch_size]` and dtype int64.
-       norm_by_batchsize (bool): normalize the loss by the batch size. 
-            If `True`, supersedes  `norm_by_times`
-            (default: `False`)
-       norm_by_total_logits_len (bool): normalize the loss by the total number of frames
-            in the batch. If `True`, supersedes `norm_by_batchsize` and `norm_by_times`
-            (default: `False`)
 
     Returns:
         Variable: The Connectionist Temporal Classification (CTC) loss,
@@ -611,12 +603,15 @@ def warpctc(input,
                 "input_length and label_length must not be None in dygraph mode!"
             )
         grad, loss_out = _C_ops.warpctc(
-            input, label, input_length, label_length, 'blank', blank,
-            'norm_by_times', norm_by_times, 'norm_by_batchsize',
-            norm_by_batchsize, 'norm_by_total_logits_len',
-            norm_by_total_logits_len)
+            input,
+            label,
+            input_length,
+            label_length,
+            'blank',
+            blank,
+            'norm_by_times',
+            norm_by_times, )
         return loss_out
-
     helper = LayerHelper('warpctc', **locals())
     check_variable_and_dtype(input, 'input', ['float32', 'float64'], "warpctc")
     check_variable_and_dtype(label, 'label', ['int32'], "warpctc")
@@ -640,8 +635,6 @@ def warpctc(input,
         attrs={
             'blank': blank,
             'norm_by_times': norm_by_times,
-            'norm_by_batchsize': norm_by_batchsize,
-            'norm_by_total_logits_len': norm_by_total_logits_len,
         })
     return loss_out
 
diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index ceda304b26e89..dd0abd212e834 100755
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -11396,9 +11396,10 @@ def shape(input):
             res = exe.run(fluid.default_main_program(), feed={'x':img}, fetch_list=[output])
             print(res) # [array([  3, 100, 100], dtype=int32)]
     """
-    check_variable_and_dtype(
-        input, 'input',
-        ['bool', 'float16', 'float32', 'float64', 'int32', 'int64'], 'shape')
+    check_variable_and_dtype(input, 'input', [
+        'bool', 'float16', 'float32', 'float64', 'int32', 'int64', 'complex64',
+        'complex128'
+    ], 'shape')
     helper = LayerHelper('shape', **locals())
     out = helper.create_variable_for_type_inference(dtype='int32')
     helper.append_op(
diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py
index 228ba08499808..7412d3a3fe6cf 100755
--- a/python/paddle/fluid/optimizer.py
+++ b/python/paddle/fluid/optimizer.py
@@ -2068,6 +2068,7 @@ def _append_optimize_op(self, block, param_and_grad):
             "lars_coeff": self._lars_coeff,
             "lars_weight_decay": [_lars_weight_decay],
             "multi_precision": find_master,
+            "epsilon": self._epsilon,
             "rescale_grad": self._rescale_grad
         }
 
diff --git a/python/paddle/fluid/profiler.py b/python/paddle/fluid/profiler.py
index 40b0862be0177..fc48a48450efd 100644
--- a/python/paddle/fluid/profiler.py
+++ b/python/paddle/fluid/profiler.py
@@ -38,72 +38,15 @@
 @signature_safe_contextmanager
 def cuda_profiler(output_file, output_mode=None, config=None):
     """
-    The CUDA profiler.
-    
-    This fuctions is used to profile CUDA program by CUDA runtime application
-    programming interface. The profiling result will be written into
-    `output_file`. The users can set the output mode by `output_mode` argument 
-    and set the nvidia profiling config by `config` argument. 
-    
-    After getting the profiling result file, users can use 
-    `NVIDIA Visual Profiler <https://developer.nvidia.com/nvidia-visual-profiler>`_ 
-    to load this output file to visualize results.
-
-    Args:
-        output_file (str) : The output file name, the result will be
-            written into this file.
-        output_mode (str, optional) : The output mode has Key-Value pair format ('kvp') 
-            and Comma separated values format ('csv', default).
-        config (list<str>, optional) : Nvidia profile config. Default config is 
-            ['gpustarttimestamp', 'gpuendtimestamp', 'gridsize3d', 'threadblocksize', 
-            'streamid', 'enableonstart 0', 'conckerneltrace']. For more details, please
-            refer to `Compute Command Line Profiler User Guide <https://developer.download.nvidia.cn/compute/DevZone/docs/html/C/doc/Compute_Command_Line_Profiler_User_Guide.pdf>`_ .
-
-    Raises:
-        ValueError: If `output_mode` is not in ['kvp', 'csv'].
-
-    Examples:
-
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            import paddle.fluid.profiler as profiler
-            import numpy as np
-
-            epoc = 8
-            dshape = [4, 3, 28, 28]
-            data = fluid.data(name='data', shape=[None, 3, 28, 28], dtype='float32')
-            conv = fluid.layers.conv2d(data, 20, 3, stride=[1, 1], padding=[1, 1])
-
-            place = fluid.CUDAPlace(0)
-            exe = fluid.Executor(place)
-            exe.run(fluid.default_startup_program())
-
-            output_file = 'cuda_profiler.txt'
-            with profiler.cuda_profiler(output_file, 'csv') as nvprof:
-                for i in range(epoc):
-                    input = np.random.random(dshape).astype('float32')
-                    exe.run(fluid.default_main_program(), feed={'data': input})
-            # then use  NVIDIA Visual Profiler (nvvp) to load this output file
-            # to visualize results.
+    API cuda_profiler has been abandoned. If you have relevant requirements, you can use `paddle.utils.profiler.start_profiler` and `paddle.utils.profiler.stop_profiler`. 
+    The relevant reference documents are as follows:
+    <https://www.paddlepaddle.org.cn/documentation/docs/en/api/paddle/utils/profiler/start_profiler_en.html#start-profiler>
+    <https://www.paddlepaddle.org.cn/documentation/docs/en/api/paddle/utils/profiler/stop_profiler_en.html#stop-profiler>
+    <https://www.paddlepaddle.org.cn/documentation/docs/en/advanced_guide/performance_improving/analysis_tools/timeline_en.html>
     """
-    if output_mode is None:
-        output_mode = 'csv'
-    if output_mode not in ['kvp', 'csv']:
-        raise ValueError("The output mode must be 'kvp' or 'csv'.")
-    config = NVPROF_CONFIG if config is None else config
-    config_file = 'nvprof_config_file'
-    with open(config_file, 'wb') as fp:
-        fp.writelines([six.b("%s\n" % item) for item in config])
-    core.nvprof_init(output_file, output_mode, config_file)
-    # Enables profiler collection by the active CUDA profiling tool.
-    core.nvprof_start()
-    try:
-        yield
-    # Disables profiler collection.
-    finally:
-        core.nvprof_stop()
-        os.remove(config_file)
+    raise RuntimeError(
+        "API cuda_profiler has been abandoned. If you have relevant requirements, you can use `paddle.utils.profiler.start_profiler` and `paddle.utils.profiler.stop_profiler`.\nThe relevant reference documents are as follows:\n<https://www.paddlepaddle.org.cn/documentation/docs/en/api/paddle/utils/profiler/start_profiler_en.html#start-profiler>\n<https://www.paddlepaddle.org.cn/documentation/docs/en/api/paddle/utils/profiler/stop_profiler_en.html#stop-profiler>\n<https://www.paddlepaddle.org.cn/documentation/docs/en/advanced_guide/performance_improving/analysis_tools/timeline_en.html>"
+    )
 
 
 @signature_safe_contextmanager
@@ -167,8 +110,7 @@ def npu_profiler(output_file, config=None):
 
 def reset_profiler():
     """
-    Clear the previous time record. This interface does not work for
-    `fluid.profiler.cuda_profiler`, it only works for
+    Clear the previous time record. It works for
     `fluid.profiler.start_profiler`, `fluid.profiler.stop_profiler`,
     and `fluid.profiler.profiler`.
 
@@ -176,6 +118,7 @@ def reset_profiler():
 
         .. code-block:: python
 
+            # required: gpu
             import paddle.fluid as fluid
             import paddle.fluid.profiler as profiler
             with profiler.profiler('CPU', 'total', '/tmp/profile'):
@@ -316,8 +259,7 @@ def profiler(state,
              profile_path='/tmp/profile',
              tracer_option='Default'):
     """
-    The profiler interface. Different from `fluid.profiler.cuda_profiler`, 
-    this profiler can be used to profile both CPU and GPU program.
+    The profiler interface. This profiler can be used to profile both CPU and GPU program.
 
     Args:
         state (str) : The profiling state, which should be one of 'CPU', 'GPU'
@@ -349,9 +291,12 @@ def profiler(state,
 
         .. code-block:: python
 
+            # required: gpu
             import paddle.fluid as fluid
             import paddle.fluid.profiler as profiler
             import numpy as np
+            import paddle
+            paddle.enable_static()
 
             epoc = 8
             dshape = [4, 3, 28, 28]
diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt
index 34ba1d19b809c..e821140a0d1ec 100644
--- a/python/paddle/fluid/tests/unittests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt
@@ -38,6 +38,7 @@ list(APPEND DIST_TEST_OPS test_hybrid_parallel_inference_helper)
 list(APPEND DIST_TEST_OPS test_parallel_class_center_sample)
 list(APPEND DIST_TEST_OPS test_parallel_margin_cross_entropy)
 list(APPEND DIST_TEST_OPS test_auto_parallel_data_unshard)
+list(APPEND DIST_TEST_OPS test_auto_parallel_save_load)
 set(MIXED_DIST_TEST_OPS ${DIST_TEST_OPS})
 #remove distribute unittests.
 list(APPEND MIXED_DIST_TEST_OPS test_dgc_op)
@@ -58,6 +59,7 @@ list(APPEND MIXED_DIST_TEST_OPS test_fleet_run_random_port)
 list(APPEND MIXED_DIST_TEST_OPS test_fleet_launch_async)
 list(APPEND MIXED_DIST_TEST_OPS test_fleet_launch_cloud)
 list(APPEND MIXED_DIST_TEST_OPS test_fleet_launch_ascend)
+list(APPEND MIXED_DIST_TEST_OPS test_fleet_launch_rank_mapping)
 list(APPEND MIXED_DIST_TEST_OPS test_ascend_group)
 list(APPEND MIXED_DIST_TEST_OPS test_fleet_launch_nproc)
 list(APPEND MIXED_DIST_TEST_OPS test_fleet_api_input)
@@ -138,6 +140,7 @@ if(((NOT WITH_ROCM) AND (NOT WITH_GPU)) OR WIN32)
     LIST(REMOVE_ITEM TEST_OPS test_raw_program_optimizer)
     LIST(REMOVE_ITEM TEST_OPS test_fleet_gradient_scale)
     LIST(REMOVE_ITEM TEST_OPS test_disable_signal_handler)
+    LIST(REMOVE_ITEM TEST_OPS test_fleet_executor)
 endif()
 
 # Temporally disable test_deprecated_decorator
@@ -252,6 +255,7 @@ if ((NOT WITH_GPU) AND (NOT WITH_ROCM))
     LIST(REMOVE_ITEM TEST_OPS test_auto_parallel_reshard_dpmppp)
     LIST(REMOVE_ITEM TEST_OPS test_auto_parallel_cost_model)
     LIST(REMOVE_ITEM TEST_OPS test_auto_parallel_data_unshard)
+    LIST(REMOVE_ITEM TEST_OPS test_auto_parallel_save_load)
 elseif(WITH_GPU)
     if (${CUDNN_VERSION} VERSION_LESS 7100)
         LIST(REMOVE_ITEM TEST_OPS test_conv2d_fusion_op)
@@ -655,6 +659,7 @@ if(WITH_DISTRIBUTE)
         bash_test_modules(test_fleet_launch_async START_BASH test_fleet_launch_async.sh ENVS PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR})
         bash_test_modules(test_fleet_launch_cloud START_BASH test_fleet_launch_cloud.sh ENVS PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR})
         bash_test_modules(test_fleet_launch_nproc START_BASH test_fleet_launch_nproc.sh ENVS PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR})
+        bash_test_modules(test_fleet_launch_rank_mapping START_BASH test_fleet_launch_rank_mapping.sh ENVS PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR})
         if(WITH_ASCEND OR WITH_ASCEND_CL)
             bash_test_modules(test_fleet_launch_ascend START_BASH test_fleet_launch_ascend.sh ENVS PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR})
             bash_test_modules(test_ascend_group START_BASH test_ascend_group.sh ENVS PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR})
@@ -1014,7 +1019,6 @@ set_tests_properties(test_dataloader_unkeep_order PROPERTIES TIMEOUT 120)
 set_tests_properties(test_reader_reset PROPERTIES TIMEOUT 120)
 set_tests_properties(test_pool3d_api PROPERTIES TIMEOUT 120)
 set_tests_properties(test_cumprod_op PROPERTIES TIMEOUT 120)
-set_tests_properties(test_solve_op PROPERTIES TIMEOUT 120)
 if(WITH_DISTRIBUTE AND WITH_GPU AND WITH_NCCL)
     set_tests_properties(test_parallel_dygraph_dataparallel PROPERTIES TIMEOUT 120)
     set_tests_properties(test_parallel_dygraph_unused_variables PROPERTIES TIMEOUT 120)
@@ -1030,6 +1034,7 @@ if(WITH_DISTRIBUTE AND WITH_GPU AND WITH_NCCL)
     set_tests_properties(test_parallel_class_center_sample PROPERTIES TIMEOUT 120)
     set_tests_properties(test_parallel_margin_cross_entropy PROPERTIES TIMEOUT 120)
     set_tests_properties(test_auto_parallel_data_unshard PROPERTIES TIMEOUT 120)
+    set_tests_properties(test_auto_parallel_save_load PROPERTIES TIMEOUT 120)
     if(${NCCL_VERSION} VERSION_GREATER_EQUAL 2212)
         set_tests_properties(test_parallel_dygraph_sparse_embedding PROPERTIES TIMEOUT 120)
         set_tests_properties(test_parallel_dygraph_transformer PROPERTIES TIMEOUT 120)
diff --git a/python/paddle/fluid/tests/unittests/asp/asp_pruning_base.py b/python/paddle/fluid/tests/unittests/asp/asp_pruning_base.py
index 370d73cc35a43..d41a7b2b842e8 100644
--- a/python/paddle/fluid/tests/unittests/asp/asp_pruning_base.py
+++ b/python/paddle/fluid/tests/unittests/asp/asp_pruning_base.py
@@ -20,7 +20,7 @@
 import paddle
 import paddle.fluid as fluid
 import paddle.fluid.core as core
-from paddle.fluid.contrib import sparsity
+from paddle.static import sparsity
 from paddle.fluid.contrib.sparsity.asp import ASPHelper
 import numpy as np
 
@@ -76,14 +76,11 @@ def __pruning_and_checking(self, exe, place, mask_func_name,
                                check_func_name, with_mask):
         exe.run(self.startup_program)
         sparsity.prune_model(
-            place,
-            self.main_program,
-            func_name=mask_func_name,
-            with_mask=with_mask)
+            self.main_program, mask_algo=mask_func_name, with_mask=with_mask)
         for param in self.main_program.global_block().all_parameters():
             if ASPHelper._is_supported_layer(self.main_program, param.name):
                 mat = np.array(fluid.global_scope().find_var(param.name)
                                .get_tensor())
                 self.assertTrue(
-                    sparsity.check_sparsity(
+                    paddle.fluid.contrib.sparsity.check_sparsity(
                         mat.T, func_name=check_func_name, n=2, m=4))
diff --git a/python/paddle/fluid/tests/unittests/asp/test_asp_optimize.py b/python/paddle/fluid/tests/unittests/asp/test_asp_optimize.py
index 402861ad5d931..9e5e3c924f1a5 100644
--- a/python/paddle/fluid/tests/unittests/asp/test_asp_optimize.py
+++ b/python/paddle/fluid/tests/unittests/asp/test_asp_optimize.py
@@ -20,7 +20,7 @@
 import paddle
 import paddle.fluid as fluid
 import paddle.fluid.core as core
-from paddle.fluid.contrib import sparsity
+from paddle.static import sparsity
 from paddle.fluid.contrib.sparsity.asp import ASPHelper
 import numpy as np
 
@@ -129,7 +129,7 @@ def test_asp_training(self):
         feeder = fluid.DataFeeder(feed_list=[self.img, self.label], place=place)
 
         exe.run(self.startup_program)
-        sparsity.prune_model(place, self.main_program)
+        sparsity.prune_model(self.main_program)
 
         data = (np.random.randn(64, 3, 32, 32), np.random.randint(
             10, size=(64, 1)))
@@ -139,7 +139,9 @@ def test_asp_training(self):
             if ASPHelper._is_supported_layer(self.main_program, param.name):
                 mat = np.array(fluid.global_scope().find_var(param.name)
                                .get_tensor())
-                self.assertTrue(sparsity.check_sparsity(mat.T, n=2, m=4))
+                self.assertTrue(
+                    paddle.fluid.contrib.sparsity.check_sparsity(
+                        mat.T, n=2, m=4))
 
     def test_asp_training_with_amp(self):
         if core.is_compiled_with_cuda():
@@ -155,7 +157,7 @@ def test_asp_training_with_amp(self):
                 feed_list=[self.img, self.label], place=place)
 
             exe.run(self.startup_program)
-            sparsity.prune_model(place, self.main_program)
+            sparsity.prune_model(self.main_program)
 
             data = (np.random.randn(64, 3, 32, 32), np.random.randint(
                 10, size=(64, 1)))
@@ -165,7 +167,9 @@ def test_asp_training_with_amp(self):
                 if ASPHelper._is_supported_layer(self.main_program, param.name):
                     mat = np.array(fluid.global_scope().find_var(param.name)
                                    .get_tensor())
-                    self.assertTrue(sparsity.check_sparsity(mat.T, n=2, m=4))
+                    self.assertTrue(
+                        paddle.fluid.contrib.sparsity.check_sparsity(
+                            mat.T, n=2, m=4))
 
     def __get_param_names(self, params):
         param_names = []
diff --git a/python/paddle/fluid/tests/unittests/asp/test_asp_pruning_1d.py b/python/paddle/fluid/tests/unittests/asp/test_asp_pruning_1d.py
index 6ebc89b18738c..7a3fa0244930c 100644
--- a/python/paddle/fluid/tests/unittests/asp/test_asp_pruning_1d.py
+++ b/python/paddle/fluid/tests/unittests/asp/test_asp_pruning_1d.py
@@ -17,7 +17,7 @@
 
 import unittest
 import paddle
-from paddle.fluid.contrib import sparsity
+from paddle.static import sparsity
 from paddle.fluid.tests.unittests.asp.asp_pruning_base import TestASPHelperPruningBase
 
 paddle.enable_static()
@@ -25,12 +25,12 @@
 
 class TestASPHelperPruning1D(TestASPHelperPruningBase):
     def test_1D_inference_pruning(self):
-        self.run_inference_pruning_test(sparsity.MaskAlgo.MASK_1D,
-                                        sparsity.CheckMethod.CHECK_1D)
+        self.run_inference_pruning_test(
+            'mask_1d', paddle.fluid.contrib.sparsity.CheckMethod.CHECK_1D)
 
     def test_1D_training_pruning(self):
-        self.run_training_pruning_test(sparsity.MaskAlgo.MASK_1D,
-                                       sparsity.CheckMethod.CHECK_1D)
+        self.run_training_pruning_test(
+            'mask_1d', paddle.fluid.contrib.sparsity.CheckMethod.CHECK_1D)
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/asp/test_asp_pruning_2d_best.py b/python/paddle/fluid/tests/unittests/asp/test_asp_pruning_2d_best.py
index b21f8edf4f477..e99509187038c 100644
--- a/python/paddle/fluid/tests/unittests/asp/test_asp_pruning_2d_best.py
+++ b/python/paddle/fluid/tests/unittests/asp/test_asp_pruning_2d_best.py
@@ -17,7 +17,7 @@
 
 import paddle
 import unittest
-from paddle.fluid.contrib import sparsity
+from paddle.static import sparsity
 from paddle.fluid.tests.unittests.asp.asp_pruning_base import TestASPHelperPruningBase
 
 paddle.enable_static()
@@ -25,12 +25,12 @@
 
 class TestASPHelperPruning2DBest(TestASPHelperPruningBase):
     def test_2D_best_inference_pruning(self):
-        self.run_inference_pruning_test(sparsity.MaskAlgo.MASK_2D_BEST,
-                                        sparsity.CheckMethod.CHECK_2D)
+        self.run_inference_pruning_test(
+            'mask_2d_best', paddle.fluid.contrib.sparsity.CheckMethod.CHECK_2D)
 
     def test_2D_best_training_pruning(self):
-        self.run_training_pruning_test(sparsity.MaskAlgo.MASK_2D_BEST,
-                                       sparsity.CheckMethod.CHECK_2D)
+        self.run_training_pruning_test(
+            'mask_2d_best', paddle.fluid.contrib.sparsity.CheckMethod.CHECK_2D)
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/asp/test_asp_pruning_2d_greedy.py b/python/paddle/fluid/tests/unittests/asp/test_asp_pruning_2d_greedy.py
index 8ec8ab485250e..7ad6c3ae02275 100644
--- a/python/paddle/fluid/tests/unittests/asp/test_asp_pruning_2d_greedy.py
+++ b/python/paddle/fluid/tests/unittests/asp/test_asp_pruning_2d_greedy.py
@@ -17,7 +17,7 @@
 
 import unittest
 import paddle
-from paddle.fluid.contrib import sparsity
+from paddle.static import sparsity
 from paddle.fluid.tests.unittests.asp.asp_pruning_base import TestASPHelperPruningBase
 
 paddle.enable_static()
@@ -25,12 +25,14 @@
 
 class TestASPHelperPruning2DGreedy(TestASPHelperPruningBase):
     def test_2D_greedy_inference_pruning(self):
-        self.run_inference_pruning_test(sparsity.MaskAlgo.MASK_2D_GREEDY,
-                                        sparsity.CheckMethod.CHECK_2D)
+        self.run_inference_pruning_test(
+            'mask_2d_greedy',
+            paddle.fluid.contrib.sparsity.CheckMethod.CHECK_2D)
 
     def test_2D_greedy_training_pruning(self):
-        self.run_training_pruning_test(sparsity.MaskAlgo.MASK_2D_GREEDY,
-                                       sparsity.CheckMethod.CHECK_2D)
+        self.run_training_pruning_test(
+            'mask_2d_greedy',
+            paddle.fluid.contrib.sparsity.CheckMethod.CHECK_2D)
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/asp/test_asp_utils.py b/python/paddle/fluid/tests/unittests/asp/test_asp_utils.py
index 387cb55e5c3cf..4aac878763b6f 100644
--- a/python/paddle/fluid/tests/unittests/asp/test_asp_utils.py
+++ b/python/paddle/fluid/tests/unittests/asp/test_asp_utils.py
@@ -18,22 +18,24 @@
 import unittest
 import threading, time
 import paddle
-from paddle.fluid.contrib import sparsity
+from paddle.static import sparsity
 import numpy as np
 
 
 class TestASPUtils(unittest.TestCase):
     def test_get_check_method(self):
         self.assertEqual(
-            sparsity.CheckMethod.get_checking_method(sparsity.MaskAlgo.MASK_1D),
-            sparsity.CheckMethod.CHECK_1D)
+            paddle.fluid.contrib.sparsity.CheckMethod.get_checking_method(
+                paddle.fluid.contrib.sparsity.MaskAlgo.MASK_1D),
+            paddle.fluid.contrib.sparsity.CheckMethod.CHECK_1D)
         self.assertEqual(
-            sparsity.CheckMethod.get_checking_method(
-                sparsity.MaskAlgo.MASK_2D_GREEDY),
-            sparsity.CheckMethod.CHECK_2D)
+            paddle.fluid.contrib.sparsity.CheckMethod.get_checking_method(
+                paddle.fluid.contrib.sparsity.MaskAlgo.MASK_2D_GREEDY),
+            paddle.fluid.contrib.sparsity.CheckMethod.CHECK_2D)
         self.assertEqual(
-            sparsity.CheckMethod.get_checking_method(
-                sparsity.MaskAlgo.MASK_2D_BEST), sparsity.CheckMethod.CHECK_2D)
+            paddle.fluid.contrib.sparsity.CheckMethod.get_checking_method(
+                paddle.fluid.contrib.sparsity.MaskAlgo.MASK_2D_BEST),
+            paddle.fluid.contrib.sparsity.CheckMethod.CHECK_2D)
 
     def test_density(self):
         x = np.array([[1.0, 1.0, 1.0, 0.0, 1.0], [1.0, 1.0, 0.0, 0.0, 1.0],
@@ -47,53 +49,59 @@ def test_check_mask_1d(self):
         x = np.array([[1.0, 0.0, 0.0, 1.0, 1.0], [1.0, 1.0, 0.0, 0.0, 1.0],
                       [1.0, 1.0, 0.0, 0.0, 1.0], [1.0, 1.0, 0.0, 0.0, 1.0],
                       [0.0, 1.0, 0.0, 0.0, 1.0]])
-        self.assertTrue(sparsity.check_mask_1d(x, 2, 4))
-        self.assertFalse(sparsity.check_mask_1d(x, 3, 4))
-        self.assertTrue(sparsity.check_mask_1d(x, 2, 5))
-        self.assertFalse(sparsity.check_mask_1d(x, 3, 5))
-        self.assertTrue(sparsity.check_mask_1d(x, 3, 6))
-        self.assertFalse(sparsity.check_mask_1d(x, 4, 6))
+        self.assertTrue(paddle.fluid.contrib.sparsity.check_mask_1d(x, 2, 4))
+        self.assertFalse(paddle.fluid.contrib.sparsity.check_mask_1d(x, 3, 4))
+        self.assertTrue(paddle.fluid.contrib.sparsity.check_mask_1d(x, 2, 5))
+        self.assertFalse(paddle.fluid.contrib.sparsity.check_mask_1d(x, 3, 5))
+        self.assertTrue(paddle.fluid.contrib.sparsity.check_mask_1d(x, 3, 6))
+        self.assertFalse(paddle.fluid.contrib.sparsity.check_mask_1d(x, 4, 6))
 
     def test_get_mask_1d(self):
         for _ in range(10):
             x = np.random.randint(10, size=(5, 5))
-            x = sparsity.get_mask_1d(x, 2, 4)
-            self.assertTrue(sparsity.check_mask_1d(x, 2, 4))
+            x = paddle.fluid.contrib.sparsity.get_mask_1d(x, 2, 4)
+            self.assertTrue(
+                paddle.fluid.contrib.sparsity.check_mask_1d(x, 2, 4))
 
             x = np.random.randn(5, 4)
-            x = sparsity.get_mask_1d(x, 2, 4)
-            self.assertTrue(sparsity.check_mask_1d(x, 2, 4))
+            x = paddle.fluid.contrib.sparsity.get_mask_1d(x, 2, 4)
+            self.assertTrue(
+                paddle.fluid.contrib.sparsity.check_mask_1d(x, 2, 4))
 
     def test_check_mask_2d(self):
         x = np.array([[1.0, 0.0, 0.0, 1.0, 1.0], [0.0, 1.0, 0.0, 0.0, 0.0],
                       [0.0, 0.0, 1.0, 0.0, 1.0], [1.0, 1.0, 0.0, 0.0, 0.0],
                       [0.0, 1.0, 0.0, 0.0, 1.0]])
-        self.assertTrue(sparsity.check_mask_2d(x, 2, 4))
-        self.assertFalse(sparsity.check_mask_2d(x, 3, 4))
-        self.assertTrue(sparsity.check_mask_2d(x, 2, 5))
-        self.assertFalse(sparsity.check_mask_2d(x, 3, 5))
-        self.assertTrue(sparsity.check_mask_2d(x, 3, 6))
-        self.assertFalse(sparsity.check_mask_2d(x, 4, 6))
+        self.assertTrue(paddle.fluid.contrib.sparsity.check_mask_2d(x, 2, 4))
+        self.assertFalse(paddle.fluid.contrib.sparsity.check_mask_2d(x, 3, 4))
+        self.assertTrue(paddle.fluid.contrib.sparsity.check_mask_2d(x, 2, 5))
+        self.assertFalse(paddle.fluid.contrib.sparsity.check_mask_2d(x, 3, 5))
+        self.assertTrue(paddle.fluid.contrib.sparsity.check_mask_2d(x, 3, 6))
+        self.assertFalse(paddle.fluid.contrib.sparsity.check_mask_2d(x, 4, 6))
 
     def test_get_mask_2d_greedy(self):
         for _ in range(10):
             x = np.random.randint(10, size=(5, 5))
-            x = sparsity.get_mask_2d_greedy(x, 2, 4)
-            self.assertTrue(sparsity.check_mask_2d(x, 2, 4))
+            x = paddle.fluid.contrib.sparsity.get_mask_2d_greedy(x, 2, 4)
+            self.assertTrue(
+                paddle.fluid.contrib.sparsity.check_mask_2d(x, 2, 4))
 
             x = np.random.randn(5, 4)
-            x = sparsity.get_mask_2d_greedy(x, 2, 4)
-            self.assertTrue(sparsity.check_mask_2d(x, 2, 4))
+            x = paddle.fluid.contrib.sparsity.get_mask_2d_greedy(x, 2, 4)
+            self.assertTrue(
+                paddle.fluid.contrib.sparsity.check_mask_2d(x, 2, 4))
 
     def test_get_mask_2d_best(self):
         for _ in range(10):
             x = np.random.randint(10, size=(5, 5))
-            x = sparsity.get_mask_2d_best(x, 2, 4)
-            self.assertTrue(sparsity.check_mask_2d(x, 2, 4))
+            x = paddle.fluid.contrib.sparsity.get_mask_2d_best(x, 2, 4)
+            self.assertTrue(
+                paddle.fluid.contrib.sparsity.check_mask_2d(x, 2, 4))
 
             x = np.random.randn(5, 4)
-            x = sparsity.get_mask_2d_best(x, 2, 4)
-            self.assertTrue(sparsity.check_mask_2d(x, 2, 4))
+            x = paddle.fluid.contrib.sparsity.get_mask_2d_best(x, 2, 4)
+            self.assertTrue(
+                paddle.fluid.contrib.sparsity.check_mask_2d(x, 2, 4))
 
     def test_threadsafe_valid_2d_patterns(self):
         def get_reference(m=4, n=2):
@@ -160,30 +168,54 @@ def test_create_mask(self):
             self.__test_1D_2D_sparse_mask_generation_methods(x)
 
     def __test_1D_2D_sparsity_checking_methods(self, x_2d):
-        mask = sparsity.get_mask_1d(x_2d, 2, 4)
+        mask = paddle.fluid.contrib.sparsity.get_mask_1d(x_2d, 2, 4)
         self.assertEqual(
-            sparsity.check_sparsity(
-                mask, func_name=sparsity.CheckMethod.CHECK_1D, n=2, m=4),
-            sparsity.check_mask_1d(mask, 2, 4))
-        mask = sparsity.get_mask_2d_best(x_2d, 2, 4)
+            paddle.fluid.contrib.sparsity.check_sparsity(
+                mask,
+                func_name=paddle.fluid.contrib.sparsity.CheckMethod.CHECK_1D,
+                n=2,
+                m=4),
+            paddle.fluid.contrib.sparsity.check_mask_1d(mask, 2, 4))
+        mask = paddle.fluid.contrib.sparsity.get_mask_2d_best(x_2d, 2, 4)
         self.assertEqual(
-            sparsity.check_sparsity(
-                mask, func_name=sparsity.CheckMethod.CHECK_2D, n=2, m=4),
-            sparsity.check_mask_2d(mask, 2, 4))
+            paddle.fluid.contrib.sparsity.check_sparsity(
+                mask,
+                func_name=paddle.fluid.contrib.sparsity.CheckMethod.CHECK_2D,
+                n=2,
+                m=4),
+            paddle.fluid.contrib.sparsity.check_mask_2d(mask, 2, 4))
 
     def __test_1D_2D_sparse_mask_generation_methods(self, x):
-        mask = sparsity.create_mask(
-            x, func_name=sparsity.MaskAlgo.MASK_1D, n=2, m=4)
+        mask = paddle.fluid.contrib.sparsity.create_mask(
+            x,
+            func_name=paddle.fluid.contrib.sparsity.MaskAlgo.MASK_1D,
+            n=2,
+            m=4)
         self.assertTrue(
-            sparsity.check_sparsity(
-                mask, func_name=sparsity.CheckMethod.CHECK_1D, n=2, m=4))
-        mask = sparsity.create_mask(
-            x, func_name=sparsity.MaskAlgo.MASK_2D_GREEDY, n=2, m=4)
+            paddle.fluid.contrib.sparsity.check_sparsity(
+                mask,
+                func_name=paddle.fluid.contrib.sparsity.CheckMethod.CHECK_1D,
+                n=2,
+                m=4))
+        mask = paddle.fluid.contrib.sparsity.create_mask(
+            x,
+            func_name=paddle.fluid.contrib.sparsity.MaskAlgo.MASK_2D_GREEDY,
+            n=2,
+            m=4)
         self.assertTrue(
-            sparsity.check_sparsity(
-                mask, func_name=sparsity.CheckMethod.CHECK_2D, n=2, m=4))
-        mask = sparsity.create_mask(
-            x, func_name=sparsity.MaskAlgo.MASK_2D_BEST, n=2, m=4)
+            paddle.fluid.contrib.sparsity.check_sparsity(
+                mask,
+                func_name=paddle.fluid.contrib.sparsity.CheckMethod.CHECK_2D,
+                n=2,
+                m=4))
+        mask = paddle.fluid.contrib.sparsity.create_mask(
+            x,
+            func_name=paddle.fluid.contrib.sparsity.MaskAlgo.MASK_2D_BEST,
+            n=2,
+            m=4)
         self.assertTrue(
-            sparsity.check_sparsity(
-                mask, func_name=sparsity.CheckMethod.CHECK_2D, n=2, m=4))
+            paddle.fluid.contrib.sparsity.check_sparsity(
+                mask,
+                func_name=paddle.fluid.contrib.sparsity.CheckMethod.CHECK_2D,
+                n=2,
+                m=4))
diff --git a/python/paddle/fluid/tests/unittests/asp/test_fleet_with_asp.py b/python/paddle/fluid/tests/unittests/asp/test_fleet_with_asp.py
index 34d17f570e427..074aedb947613 100644
--- a/python/paddle/fluid/tests/unittests/asp/test_fleet_with_asp.py
+++ b/python/paddle/fluid/tests/unittests/asp/test_fleet_with_asp.py
@@ -20,7 +20,7 @@
 import paddle.fluid as fluid
 import paddle.fluid.core as core
 import os
-from paddle.fluid.contrib import sparsity
+from paddle.static import sparsity
 from paddle.fluid.contrib.sparsity.asp import ASPHelper
 import numpy as np
 cuda_visible_devices = os.getenv('CUDA_VISIBLE_DEVICES')
@@ -73,7 +73,7 @@ def test_with_asp(self):
         feeder = fluid.DataFeeder(feed_list=[input_x, input_y], place=place)
         exe.run(startup_prog)
 
-        sparsity.prune_model(place, train_prog)
+        sparsity.prune_model(train_prog)
 
         data = (np.random.randn(64, 32), np.random.randint(2, size=(64, 1)))
         exe.run(train_prog, feed=feeder.feed([data]))
@@ -82,7 +82,9 @@ def test_with_asp(self):
             if ASPHelper._is_supported_layer(train_prog, param.name):
                 mat = np.array(fluid.global_scope().find_var(param.name)
                                .get_tensor())
-                self.assertTrue(sparsity.check_sparsity(mat.T, n=2, m=4))
+                self.assertTrue(
+                    paddle.fluid.contrib.sparsity.check_sparsity(
+                        mat.T, n=2, m=4))
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/asp/test_fleet_with_asp_amp.py b/python/paddle/fluid/tests/unittests/asp/test_fleet_with_asp_amp.py
index c4074b2ae7a3c..a34d7e69872e2 100644
--- a/python/paddle/fluid/tests/unittests/asp/test_fleet_with_asp_amp.py
+++ b/python/paddle/fluid/tests/unittests/asp/test_fleet_with_asp_amp.py
@@ -20,7 +20,7 @@
 import paddle.fluid as fluid
 import paddle.fluid.core as core
 import os
-from paddle.fluid.contrib import sparsity
+from paddle.static import sparsity
 from paddle.fluid.contrib.sparsity.asp import ASPHelper
 import numpy as np
 cuda_visible_devices = os.getenv('CUDA_VISIBLE_DEVICES')
@@ -76,7 +76,7 @@ def test_with_asp_and_amp(self):
 
         optimizer.amp_init(place)
 
-        sparsity.prune_model(place, train_prog)
+        sparsity.prune_model(train_prog)
 
         data = (np.random.randn(64, 32), np.random.randint(2, size=(64, 1)))
         exe.run(train_prog, feed=feeder.feed([data]))
@@ -85,7 +85,9 @@ def test_with_asp_and_amp(self):
             if ASPHelper._is_supported_layer(train_prog, param.name):
                 mat = np.array(fluid.global_scope().find_var(param.name)
                                .get_tensor())
-                self.assertTrue(sparsity.check_sparsity(mat.T, n=2, m=4))
+                self.assertTrue(
+                    paddle.fluid.contrib.sparsity.check_sparsity(
+                        mat.T, n=2, m=4))
 
     def test_with_asp_and_pure_fp16(self):
         fleet.init(is_collective=True)
@@ -114,7 +116,7 @@ def test_with_asp_and_pure_fp16(self):
 
         optimizer.amp_init(place)
 
-        sparsity.prune_model(place, train_prog)
+        sparsity.prune_model(train_prog)
 
         data = (np.random.randn(64, 32), np.random.randint(2, size=(64, 1)))
         exe.run(train_prog, feed=feeder.feed([data]))
@@ -123,7 +125,9 @@ def test_with_asp_and_pure_fp16(self):
             if ASPHelper._is_supported_layer(train_prog, param.name):
                 mat = np.array(fluid.global_scope().find_var(param.name)
                                .get_tensor())
-                self.assertTrue(sparsity.check_sparsity(mat.T, n=2, m=4))
+                self.assertTrue(
+                    paddle.fluid.contrib.sparsity.check_sparsity(
+                        mat.T, n=2, m=4))
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/auto_parallel_data_unshard.py b/python/paddle/fluid/tests/unittests/auto_parallel_data_unshard.py
index 367d985862684..ed8cb8a23c372 100644
--- a/python/paddle/fluid/tests/unittests/auto_parallel_data_unshard.py
+++ b/python/paddle/fluid/tests/unittests/auto_parallel_data_unshard.py
@@ -36,8 +36,7 @@ def test_dp2pp1mp1(self):
         def create_model(train_program, start_program):
             with paddle.static.program_guard(train_program, start_program):
 
-                ROOT_MESH = auto.ProcessMesh([0, 1])
-                MESH_0 = auto.ProcessMesh([0, 1], ROOT_MESH)
+                MESH_0 = auto.ProcessMesh([0, 1])
                 input = paddle.static.data(name='input', shape=[2, 8])
                 label = paddle.static.data(name='label', shape=[2, 8])
 
@@ -47,10 +46,30 @@ def create_model(train_program, start_program):
                 linear0 = nn.Linear(8, 8, weight_attr)
                 linear1 = nn.Linear(8, 8, weight_attr)
 
-                auto.shard_tensor(input, MESH_0, dim_mapping=[0, -1])
-                auto.shard_tensor(label, MESH_0, dim_mapping=[0, -1])
-                auto.shard_tensor(linear0.weight, MESH_0, dim_mapping=[-1, -1])
-                auto.shard_tensor(linear1.weight, MESH_0, dim_mapping=[-1, -1])
+                auto.shard_tensor(
+                    input,
+                    dist_attr={
+                        "process_mesh": MESH_0,
+                        "dims_mapping": [0, -1]
+                    })
+                auto.shard_tensor(
+                    label,
+                    dist_attr={
+                        "process_mesh": MESH_0,
+                        "dims_mapping": [0, -1]
+                    })
+                auto.shard_tensor(
+                    linear0.weight,
+                    dist_attr={
+                        "process_mesh": MESH_0,
+                        "dims_mapping": [-1, -1]
+                    })
+                auto.shard_tensor(
+                    linear1.weight,
+                    dist_attr={
+                        "process_mesh": MESH_0,
+                        "dims_mapping": [-1, -1]
+                    })
 
                 linear0_out = linear0(input)
                 gelu_out = F.gelu(linear0_out)
@@ -105,8 +124,7 @@ def dp1pp1mp2(self):
         def create_model(train_program, start_program):
             with paddle.static.program_guard(train_program, start_program):
 
-                ROOT_MESH = auto.ProcessMesh([0, 1])
-                MESH_0 = auto.ProcessMesh([0, 1], ROOT_MESH)
+                MESH_0 = auto.ProcessMesh([0, 1])
                 input = paddle.static.data(name='input', shape=[8, 8])
                 label = paddle.static.data(name='label', shape=[8, 8])
 
@@ -116,11 +134,31 @@ def create_model(train_program, start_program):
                 linear0 = nn.Linear(8, 8, weight_attr)
                 linear1 = nn.Linear(8, 8, weight_attr)
 
-                auto.shard_tensor(input, MESH_0, dim_mapping=[-1, -1])
-                auto.shard_tensor(label, MESH_0, dim_mapping=[-1, -1])
-
-                auto.shard_tensor(linear0.weight, MESH_0, dim_mapping=[-1, 0])
-                auto.shard_tensor(linear1.weight, MESH_0, dim_mapping=[0, -1])
+                auto.shard_tensor(
+                    input,
+                    dist_attr={
+                        "process_mesh": MESH_0,
+                        "dims_mapping": [-1, -1]
+                    })
+                auto.shard_tensor(
+                    label,
+                    dist_attr={
+                        "process_mesh": MESH_0,
+                        "dims_mapping": [-1, -1]
+                    })
+
+                auto.shard_tensor(
+                    linear0.weight,
+                    dist_attr={
+                        "process_mesh": MESH_0,
+                        "dims_mapping": [-1, 0]
+                    })
+                auto.shard_tensor(
+                    linear1.weight,
+                    dist_attr={
+                        "process_mesh": MESH_0,
+                        "dims_mapping": [0, -1]
+                    })
 
                 linear0_out = linear0(input)
                 gelu_out = F.gelu(linear0_out)
diff --git a/python/paddle/fluid/tests/unittests/auto_parallel_parallelizer.py b/python/paddle/fluid/tests/unittests/auto_parallel_parallelizer.py
index 89880f8c2f49d..036b46470a762 100755
--- a/python/paddle/fluid/tests/unittests/auto_parallel_parallelizer.py
+++ b/python/paddle/fluid/tests/unittests/auto_parallel_parallelizer.py
@@ -24,13 +24,12 @@
 from paddle.fluid import layers
 from paddle.distributed import fleet
 import paddle.distributed.auto_parallel as auto
-from paddle.distributed.auto_parallel.utils import print_program_with_distributed_attr
+from paddle.distributed.auto_parallel.utils import print_program_with_dist_attr
 import paddle.fluid.core as core
 
 paddle.enable_static()
 _global_parallel_strategy = None
 _global_process_mesh = None
-ROOT_MESH = auto.ProcessMesh([0, 1])
 
 
 class MLPLayer(nn.Layer):
@@ -78,8 +77,12 @@ def mlp_pretrain_forward(train_program, start_program):
         label = static.data(
             name="label", shape=[batch_size, sequence_len, 1], dtype='float32')
 
-        auto.shard_tensor(input, _global_process_mesh, dim_mapping=[-1, -1, -1])
-        auto.set_pipeline_stage(1)
+        auto.shard_tensor(
+            input,
+            dist_attr={
+                "process_mesh": _global_process_mesh,
+                "dims_mappig": [-1, -1, -1]
+            })
 
         mlp = MLPLayer(
             hidden_size=hidden_size,
@@ -99,7 +102,7 @@ class TestMLPAutoParallelizer(unittest.TestCase):
     def test_mlp_serial(self):
 
         global _global_process_mesh
-        _global_process_mesh = auto.ProcessMesh(mesh=[0, 1], parent=ROOT_MESH)
+        _global_process_mesh = auto.ProcessMesh(mesh=[0, 1])
 
         dist_strategy = fleet.DistributedStrategy()
         dist_strategy.amp = False
@@ -131,7 +134,7 @@ def test_mlp_serial(self):
             for op in block.ops:
                 for attr_name in op.attr_names:
                     self.assertTrue(suffix not in attr_name)
-        # print_program_with_distributed_attr(distributed_main_program)
+        # print_program_with_dist_attr(distributed_main_program)
         self.assertIsNotNone(distributed_startup_program)
         self.assertIsNotNone(distributed_main_program)
 
diff --git a/python/paddle/fluid/tests/unittests/auto_parallel_save_load.py b/python/paddle/fluid/tests/unittests/auto_parallel_save_load.py
new file mode 100644
index 0000000000000..6996fab09112f
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/auto_parallel_save_load.py
@@ -0,0 +1,344 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import random
+import numpy as np
+import os
+import shutil
+
+import paddle
+import paddle.nn as nn
+import paddle.utils as utils
+import paddle.static as static
+import paddle.nn.functional as F
+import paddle.distributed.auto_parallel as auto
+
+from paddle.distributed import fleet
+from paddle.fluid.initializer import NumpyArrayInitializer
+from paddle.distributed.auto_parallel.utils import make_data_unshard
+from paddle.distributed.auto_parallel.utils import save_distributed_checkpoint, load_distributed_checkpoint
+from paddle.distributed.auto_parallel.reshard import reshard
+from paddle.distributed.auto_parallel.partitioner import Partitioner
+from paddle.distributed.auto_parallel.dist_context import DistributedContext
+from paddle.distributed.auto_parallel.process_group import get_all_process_groups
+
+paddle.enable_static()
+_global_parallel_strategy = None
+_global_process_mesh = None
+PP_MESH_0 = None
+PP_MESH_1 = None
+
+
+class MLPLayer(nn.Layer):
+    def __init__(self,
+                 hidden_size=64,
+                 intermediate_size=4 * 64,
+                 initializer_range=0.02):
+        super(MLPLayer, self).__init__()
+        d_model = hidden_size
+        dim_feedforward = intermediate_size
+        np.random.seed(2021)
+        arr = np.random.normal(0, 0.02, size=(d_model, dim_feedforward))
+        weight_attr = paddle.ParamAttr(initializer=NumpyArrayInitializer(arr))
+        bias_attr = None
+
+        self.linear0 = nn.Linear(
+            d_model, dim_feedforward, weight_attr, bias_attr=bias_attr)
+        self.linear1 = nn.Linear(
+            dim_feedforward, d_model, weight_attr, bias_attr=bias_attr)
+        self.norm = nn.LayerNorm(d_model, epsilon=1e-5)
+
+    def forward(self, input):
+        if _global_parallel_strategy == "pp":
+            auto.shard_tensor(
+                self.linear0.weight,
+                dist_attr={
+                    "process_mesh": PP_MESH_0,
+                    "dims_mapping": [-1, -1]
+                })
+            auto.shard_tensor(
+                self.linear1.weight,
+                dist_attr={
+                    "process_mesh": PP_MESH_1,
+                    "dims_mapping": [-1, -1]
+                })
+        elif _global_parallel_strategy == "mp":
+            auto.shard_tensor(
+                self.linear0.weight,
+                dist_attr={
+                    "process_mesh": _global_process_mesh,
+                    "dims_mapping": [-1, 0]
+                })
+            auto.shard_tensor(
+                self.linear1.weight,
+                dist_attr={
+                    "process_mesh": _global_process_mesh,
+                    "dims_mapping": [0, -1]
+                })
+        elif _global_parallel_strategy == "dp":
+            auto.shard_tensor(
+                self.linear0.weight,
+                dist_attr={
+                    "process_mesh": _global_process_mesh,
+                    "dims_mapping": [-1, -1]
+                })
+            auto.shard_tensor(
+                self.linear1.weight,
+                dist_attr={
+                    "process_mesh": _global_process_mesh,
+                    "dims_mapping": [-1, -1]
+                })
+
+        out = self.norm(input)
+        out = self.linear0(out)
+        out = F.gelu(out, approximate=True)
+        out = self.linear1(out)
+
+        return out
+
+
+def mlp_forward(train_program, start_program):
+    with static.program_guard(train_program,start_program), \
+        utils.unique_name.guard():
+
+        batch_size = 4
+        hidden_size = 64
+        input = static.data(
+            name="input", shape=[batch_size, hidden_size], dtype='float32')
+        label = static.data(
+            name="label", shape=[batch_size, 1], dtype='float32')
+
+        if _global_parallel_strategy == "pp":
+            auto.shard_tensor(
+                input,
+                dist_attr={
+                    "process_mesh": PP_MESH_0,
+                    "dims_mapping": [-1, -1]
+                })
+            auto.shard_tensor(
+                label,
+                dist_attr={
+                    "process_mesh": PP_MESH_1,
+                    "dims_mapping": [-1, -1]
+                })
+        elif _global_parallel_strategy == "dp":
+            auto.shard_tensor(
+                input,
+                dist_attr={
+                    "process_mesh": _global_process_mesh,
+                    "dims_mapping": [0, -1]
+                })
+        elif _global_parallel_strategy == "mp":
+            auto.shard_tensor(
+                input,
+                dist_attr={
+                    "process_mesh": _global_process_mesh,
+                    "dims_mapping": [-1, -1]
+                })
+
+        mlp = MLPLayer(
+            hidden_size=hidden_size,
+            intermediate_size=4 * hidden_size,
+            initializer_range=0.02)
+
+        predict = mlp(input)
+        error_cost = paddle.nn.functional.square_error_cost(predict, label)
+        loss = paddle.mean(error_cost)
+
+    return loss, train_program, start_program
+
+
+def get_distributed_program():
+    train_program = static.Program()
+    startup_program = static.Program()
+
+    dist_strategy = fleet.DistributedStrategy()
+    dist_strategy.semi_auto = True
+    fleet.init(is_collective=True, strategy=dist_strategy)
+
+    loss, train_program, startup_program = mlp_forward(train_program,
+                                                       startup_program)
+
+    optimizer = paddle.fluid.optimizer.SGDOptimizer(learning_rate=0.01)
+    optimizer = fleet.distributed_optimizer(optimizer)
+    _, _, dist_startup_prog, dist_main_prog = optimizer.minimize(
+        loss, startup_program)
+
+    return dist_main_prog, dist_startup_prog, loss
+
+
+class TestMLPSaveLoad(unittest.TestCase):
+    def setUp(self):
+        paddle.seed(2021)
+        random.seed(2021)
+        np.random.seed(2021)
+
+    def test_mlp_dp(self):
+        global _global_parallel_strategy
+        _global_parallel_strategy = "dp"
+        global _global_process_mesh
+        _global_process_mesh = auto.ProcessMesh([0, 1])
+
+        dist_main_prog, dist_start_prog, loss = get_distributed_program()
+        place = paddle.set_device("gpu")
+        exe = paddle.static.Executor(place)
+        exe.run(dist_start_prog)
+
+        input = np.random.random(size=(80, 64)).astype('float32')
+        label = np.random.random(size=(80, 1)).astype('float32')
+        for step in range(20):
+            if step == 10:
+                path = "./output_dp{}".format(paddle.distributed.get_rank())
+                os.makedirs(path, exist_ok=True)
+                save_distributed_checkpoint(dist_main_prog, path)
+
+            res = exe.run(dist_main_prog,
+                          feed={
+                              "input": input[step * 4:(step + 1) * 4, :],
+                              "label": label[step * 4:(step + 1) * 4, :]
+                          },
+                          fetch_list=[loss])
+
+        last_res = res[0]
+        ckpt_path = [
+            "./output_dp0/model_state_rank0.pdmodel",
+            "./output_dp1/model_state_rank1.pdmodel"
+        ]
+        load_distributed_checkpoint(ckpt_path, dist_main_prog)
+        for step in range(10, 20):
+            res = exe.run(dist_main_prog,
+                          feed={
+                              "input": input[step * 4:(step + 1) * 4, :],
+                              "label": label[step * 4:(step + 1) * 4, :]
+                          },
+                          fetch_list=[loss])
+
+        self.assertEqual(last_res, res[0])
+        shutil.rmtree("./output_dp{}".format(paddle.distributed.get_rank()))
+
+    def test_mlp_mp(self):
+        global _global_parallel_strategy
+        _global_parallel_strategy = "mp"
+        global _global_process_mesh
+        _global_process_mesh = auto.ProcessMesh([0, 1])
+
+        dist_main_prog, dist_start_prog, loss = get_distributed_program()
+
+        place = paddle.set_device("gpu")
+        exe = paddle.static.Executor(place)
+        exe.run(dist_start_prog)
+
+        input = np.random.random(size=(80, 64)).astype('float32')
+        label = np.random.random(size=(80, 1)).astype('float32')
+        for step in range(20):
+            if step == 10:
+                path = "./output_mp{}".format(paddle.distributed.get_rank())
+                os.makedirs(path, exist_ok=True)
+                save_distributed_checkpoint(dist_main_prog, path)
+
+            res = exe.run(dist_main_prog,
+                          feed={
+                              "input": input[step * 4:(step + 1) * 4, :],
+                              "label": label[step * 4:(step + 1) * 4, :]
+                          },
+                          fetch_list=[loss])
+
+        last_res = res[0]
+        ckpt_path = [
+            "./output_mp0/model_state_rank0.pdmodel",
+            "./output_mp1/model_state_rank1.pdmodel"
+        ]
+        load_distributed_checkpoint(ckpt_path, dist_main_prog)
+        for step in range(10, 20):
+            res = exe.run(dist_main_prog,
+                          feed={
+                              "input": input[step * 4:(step + 1) * 4, :],
+                              "label": label[step * 4:(step + 1) * 4, :]
+                          },
+                          fetch_list=[loss])
+
+        self.assertEqual(last_res, res[0])
+        shutil.rmtree("./output_mp{}".format(paddle.distributed.get_rank()))
+
+    def test_mlp_pp(self):
+        global _global_parallel_strategy
+        _global_parallel_strategy = "pp"
+        global _global_process_mesh
+        _global_process_mesh = auto.ProcessMesh([0, 1])
+        global PP_MESH_0
+        PP_MESH_0 = auto.ProcessMesh(mesh=[0])
+        global PP_MESH_1
+        PP_MESH_1 = auto.ProcessMesh(mesh=[1])
+
+        dist_main_prog, dist_start_prog, loss = get_distributed_program()
+
+        place = paddle.set_device("gpu")
+        exe = paddle.static.Executor(place)
+        exe.run(dist_start_prog)
+
+        input = np.random.random(size=(80, 64)).astype('float32')
+        label = np.random.random(size=(80, 1)).astype('float32')
+        for step in range(20):
+            if step == 10:
+                path = "./output_pp{}".format(paddle.distributed.get_rank())
+                os.makedirs(path, exist_ok=True)
+                save_distributed_checkpoint(dist_main_prog, path)
+
+            if paddle.distributed.get_rank() in [0]:
+                res = exe.run(dist_main_prog,
+                              feed={
+                                  "input": input[step * 4:(step + 1) * 4, :],
+                                  "label": label[step * 4:(step + 1) * 4, :]
+                              })
+            else:
+                res = exe.run(dist_main_prog,
+                              feed={
+                                  "input": input[step * 4:(step + 1) * 4, :],
+                                  "label": label[step * 4:(step + 1) * 4, :]
+                              },
+                              fetch_list=[loss])
+
+        if paddle.distributed.get_rank() in [1]:
+            last_res = res[0]
+
+        ckpt_path = [
+            "./output_pp0/model_state_rank0.pdmodel",
+            "./output_pp1/model_state_rank1.pdmodel"
+        ]
+        load_distributed_checkpoint(ckpt_path, dist_main_prog)
+        for step in range(10, 20):
+            if paddle.distributed.get_rank() in [0]:
+                res = exe.run(dist_main_prog,
+                              feed={
+                                  "input": input[step * 4:(step + 1) * 4, :],
+                                  "label": label[step * 4:(step + 1) * 4, :]
+                              })
+            else:
+                res = exe.run(dist_main_prog,
+                              feed={
+                                  "input": input[step * 4:(step + 1) * 4, :],
+                                  "label": label[step * 4:(step + 1) * 4, :]
+                              },
+                              fetch_list=[loss])
+
+        if paddle.distributed.get_rank() in [1]:
+            self.assertEqual(last_res, res[0])
+        shutil.rmtree("./output_pp{}".format(paddle.distributed.get_rank()))
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_origin_info.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_origin_info.py
index 016a1b3b588ab..e3d34184a38fc 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_origin_info.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_origin_info.py
@@ -65,7 +65,7 @@ def set_test_func(self):
         self.func = simple_func
 
     def set_static_lineno(self):
-        self.static_abs_lineno_list = [5, 6, 7]
+        self.static_abs_lineno_list = [6, 7, 8]
 
     def set_dygraph_info(self):
         self.line_num = 3
@@ -149,7 +149,7 @@ def set_test_func(self):
         self.func = nested_func
 
     def set_static_lineno(self):
-        self.static_abs_lineno_list = [5, 7, 8, 9, 10]
+        self.static_abs_lineno_list = [6, 8, 9, 10, 11]
 
     def set_dygraph_info(self):
         self.line_num = 5
@@ -174,7 +174,7 @@ def set_test_func(self):
         self.func = decorated_func
 
     def set_static_lineno(self):
-        self.static_abs_lineno_list = [5, 6]
+        self.static_abs_lineno_list = [6, 7]
 
     def set_dygraph_info(self):
         self.line_num = 2
@@ -208,7 +208,7 @@ def set_test_func(self):
         self.func = decorated_func2
 
     def set_static_lineno(self):
-        self.static_abs_lineno_list = [5, 6]
+        self.static_abs_lineno_list = [6, 7]
 
     def set_dygraph_info(self):
         self.line_num = 2
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_static_analysis.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_static_analysis.py
index 7f6d6cf1f3b00..afccaca69383c 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_static_analysis.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_static_analysis.py
@@ -57,6 +57,8 @@ def func_to_test3():
     h = None
     i = False
     j = None + 1
+    k: float = 1.0
+    l: paddle.Tensor = paddle.to_tensor([1, 2])
 
 
 result_var_type3 = {
@@ -69,7 +71,9 @@ def func_to_test3():
     'g': {NodeVarType.STRING},
     'h': {NodeVarType.NONE},
     'i': {NodeVarType.BOOLEAN},
-    'j': {NodeVarType.UNKNOWN}
+    'j': {NodeVarType.UNKNOWN},
+    'k': {NodeVarType.FLOAT},
+    'l': {NodeVarType.PADDLE_RETURN_TYPES}
 }
 
 
@@ -139,13 +143,25 @@ def add(x, y):
     'add': {NodeVarType.INT}
 }
 
+
+def func_to_test7(a: int, b: float, c: paddle.Tensor, d: float='diff'):
+    a = True
+
+
+result_var_type7 = {
+    'a': {NodeVarType.BOOLEAN},
+    'b': {NodeVarType.FLOAT},
+    'c': {NodeVarType.TENSOR},
+    'd': {NodeVarType.STRING}
+}
+
 test_funcs = [
     func_to_test1, func_to_test2, func_to_test3, func_to_test4, func_to_test5,
-    func_to_test6
+    func_to_test6, func_to_test7
 ]
 result_var_type = [
     result_var_type1, result_var_type2, result_var_type3, result_var_type4,
-    result_var_type5, result_var_type6
+    result_var_type5, result_var_type6, result_var_type7
 ]
 
 
diff --git a/python/paddle/fluid/tests/unittests/fft/test_fft.py b/python/paddle/fluid/tests/unittests/fft/test_fft.py
index c83c943217d4e..0ef7a1e939e02 100644
--- a/python/paddle/fluid/tests/unittests/fft/test_fft.py
+++ b/python/paddle/fluid/tests/unittests/fft/test_fft.py
@@ -1012,6 +1012,9 @@ def test_rfftfreq(self):
 @parameterize((TEST_CASE_NAME, 'x', 'axes', 'dtype'), [
     ('test_1d', np.random.randn(10), (0, ), 'float64'),
     ('test_2d', np.random.randn(10, 10), (0, 1), 'float64'),
+    ('test_2d_with_all_axes', np.random.randn(10, 10), None, 'float64'),
+    ('test_2d_odd_with_all_axes',
+     np.random.randn(5, 5) + 1j * np.random.randn(5, 5), None, 'complex128'),
 ])
 class TestFftShift(unittest.TestCase):
     def test_fftshift(self):
@@ -1027,10 +1030,13 @@ def test_fftshift(self):
 
 
 @place(DEVICES)
-@parameterize((TEST_CASE_NAME, 'x', 'axes'), [
-    ('test_1d', np.random.randn(10), (0, ), 'float64'),
-    ('test_2d', np.random.randn(10, 10), (0, 1), 'float64'),
-])
+@parameterize(
+    (TEST_CASE_NAME, 'x', 'axes'),
+    [('test_1d', np.random.randn(10), (0, ),
+      'float64'), ('test_2d', np.random.randn(10, 10), (0, 1), 'float64'),
+     ('test_2d_with_all_axes', np.random.randn(10, 10), None, 'float64'),
+     ('test_2d_odd_with_all_axes',
+      np.random.randn(5, 5) + 1j * np.random.randn(5, 5), None, 'complex128')])
 class TestIfftShift(unittest.TestCase):
     def test_ifftshift(self):
         """Test ifftshift with norm condition
diff --git a/python/paddle/fluid/tests/unittests/fft/test_fft_with_static_graph.py b/python/paddle/fluid/tests/unittests/fft/test_fft_with_static_graph.py
index ac9d1557b53e9..4f19cd06a493f 100644
--- a/python/paddle/fluid/tests/unittests/fft/test_fft_with_static_graph.py
+++ b/python/paddle/fluid/tests/unittests/fft/test_fft_with_static_graph.py
@@ -888,6 +888,56 @@ def test_static_ihfftn(self):
                 pass
 
 
+@place(DEVICES)
+@parameterize((TEST_CASE_NAME, 'x', 'axes', 'dtype'), [
+    ('test_1d', np.random.randn(10), (0, ), 'float64'),
+    ('test_2d', np.random.randn(10, 10), (0, 1), 'float64'),
+    ('test_2d_with_all_axes', np.random.randn(10, 10), None, 'float64'),
+    ('test_2d_odd_with_all_axes',
+     np.random.randn(5, 5) + 1j * np.random.randn(5, 5), None, 'complex128'),
+])
+class TestFftShift(unittest.TestCase):
+    def test_fftshift(self):
+        """Test fftshift with norm condition
+        """
+        paddle.enable_static()
+        mp, sp = paddle.static.Program(), paddle.static.Program()
+        with paddle.static.program_guard(mp, sp):
+            input = paddle.static.data('input', x.shape, dtype=x.dtype)
+            output = paddle.fft.fftshift(input, axes)
+
+        exe = paddle.static.Executor(place)
+        exe.run(sp)
+        [output] = exe.run(mp, feed={'input': x}, fetch_list=[output])
+        yield output
+        paddle.disable_static()
+
+
+@place(DEVICES)
+@parameterize(
+    (TEST_CASE_NAME, 'x', 'axes'),
+    [('test_1d', np.random.randn(10), (0, ),
+      'float64'), ('test_2d', np.random.randn(10, 10), (0, 1), 'float64'),
+     ('test_2d_with_all_axes', np.random.randn(10, 10), None, 'float64'),
+     ('test_2d_odd_with_all_axes',
+      np.random.randn(5, 5) + 1j * np.random.randn(5, 5), None, 'complex128')])
+class TestIfftShift(unittest.TestCase):
+    def test_ifftshift(self):
+        """Test ifftshift with norm condition
+        """
+        paddle.enable_static()
+        mp, sp = paddle.static.Program(), paddle.static.Program()
+        with paddle.static.program_guard(mp, sp):
+            input = paddle.static.data('input', x.shape, dtype=x.dtype)
+            output = paddle.fft.ifftshift(input, axes)
+
+        exe = paddle.static.Executor(place)
+        exe.run(sp)
+        [output] = exe.run(mp, feed={'input': x}, fetch_list=[output])
+        yield output
+        paddle.disable_static()
+
+
 if __name__ == '__main__':
     unittest.main()
 
diff --git a/python/paddle/fluid/tests/unittests/gradient_checker.py b/python/paddle/fluid/tests/unittests/gradient_checker.py
index b56bbc07a7f44..dff2b7aa8d8d6 100644
--- a/python/paddle/fluid/tests/unittests/gradient_checker.py
+++ b/python/paddle/fluid/tests/unittests/gradient_checker.py
@@ -304,7 +304,6 @@ def fail_test(msg):
                 if b.has_var(xi.name):
                     clone_x.append(b.var(xi.name))
                     break
-
         analytical.append(
             _compute_analytical_jacobian(prog, clone_x, clone_y, place, scope))
 
@@ -486,7 +485,6 @@ def triple_grad_check(x,
             var_to_np_array_in_scope(scope, place, v.name)
             for v in x_grads_grads
         ]
-
     x += y_grads
     x_init = _as_list(x_init)
     x_init += y_grads_init
diff --git a/python/paddle/fluid/tests/unittests/interpreter/test_standalone_controlflow.py b/python/paddle/fluid/tests/unittests/interpreter/test_standalone_controlflow.py
new file mode 100644
index 0000000000000..7c1497a48535e
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/interpreter/test_standalone_controlflow.py
@@ -0,0 +1,131 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import sys
+import unittest
+import paddle
+from paddle.fluid import core
+from paddle.fluid.core import StandaloneExecutor
+import paddle.fluid as fluid
+from paddle.fluid.framework import Program, program_guard
+import paddle.fluid.layers as layers
+
+import numpy as np
+
+paddle.enable_static()
+
+
+#  test the compatibility of new executor: run old
+#  and new executor twice and check the result.
+#  please override the _get_feeds() and build_prgram()
+class TestCompatibility(unittest.TestCase):
+    def setUp(self):
+        self.place = paddle.CUDAPlace(0) if core.is_compiled_with_cuda(
+        ) else paddle.CPUPlace()
+        self.iter_run = 4
+
+    def _get_feed(self):
+        """ return the feeds
+        """
+        return None
+
+    def build_program(self):
+        def true_func():
+            return layers.fill_constant(
+                shape=[1, 2], dtype='int32', value=1), layers.fill_constant(
+                    shape=[2, 3], dtype='bool', value=True)
+
+        def false_func():
+            return layers.fill_constant(
+                shape=[3, 4], dtype='float32', value=3), layers.fill_constant(
+                    shape=[4, 5], dtype='int64', value=2)
+
+        main_program = Program()
+        startup_program = Program()
+        with program_guard(main_program, startup_program):
+            x = layers.fill_constant(shape=[1], dtype='float32', value=0.1)
+            y = layers.fill_constant(shape=[1], dtype='float32', value=0.23)
+            pred = layers.less_than(x, y)
+            out = layers.cond(pred, true_func, false_func)
+            # out is a tuple containing 2 tensors
+            return main_program, startup_program, out
+
+    def _run(self, feed):
+        paddle.seed(2020)
+
+        main_program, startup_program, fetch_vars = self.build_program()
+
+        exe = paddle.static.Executor(self.place)
+        exe.run(startup_program)
+        ret = []
+        for i in range(self.iter_run):
+            ret.append(exe.run(main_program, feed=feed, fetch_list=fetch_vars))
+        return ret
+
+    def run_raw_executor(self, feed):
+        out = self._run(feed)
+        print("GT:", out)
+        return out
+
+    def run_new_executor(self, feed):
+        os.environ['FLAGS_USE_STANDALONE_EXECUTOR'] = '1'
+        out = self._run(feed)
+        del os.environ['FLAGS_USE_STANDALONE_EXECUTOR']
+        print("New:", out)
+        return out
+
+    def test_with_feed(self):
+        feed = self._get_feed()
+        res = self.run_new_executor(feed)
+        gt = self.run_raw_executor(feed)
+        for x, y in zip(gt, res):
+            if isinstance(x, list):
+                for tx, ty in zip(x, y):
+                    self.assertTrue(np.array_equal(tx, ty))
+            elif isinstance(x, np.ndarray):
+                self.assertTrue(np.array_equal(tx, ty))
+            else:
+                raise Exception("Not Implement!")
+
+
+class TestWhile(TestCompatibility):
+    def _get_feed(self):
+        """ return the feeds
+        """
+        return None
+
+    def build_program(self):
+        def cond(i, ten):
+            return i < ten
+
+        def body(i, ten):
+            i = i + 1
+            return [i, ten]
+
+        main_program = paddle.static.default_main_program()
+        startup_program = paddle.static.default_startup_program()
+        with paddle.static.program_guard(main_program, startup_program):
+            i = paddle.full(
+                shape=[1], fill_value=0, dtype='int64')  # loop counter
+            ten = paddle.full(
+                shape=[1], fill_value=10, dtype='int64')  # loop length
+            i, ten = paddle.static.nn.while_loop(cond, body, [i, ten])
+
+            exe = paddle.static.Executor(paddle.CPUPlace())
+        return main_program, startup_program, i
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/interpreter/test_standalone_executor.py b/python/paddle/fluid/tests/unittests/interpreter/test_standalone_executor.py
index c927476caecd1..03062ab8e2b29 100644
--- a/python/paddle/fluid/tests/unittests/interpreter/test_standalone_executor.py
+++ b/python/paddle/fluid/tests/unittests/interpreter/test_standalone_executor.py
@@ -256,10 +256,12 @@ def build_program(self):
         main_program = paddle.static.Program()
         startup_program = paddle.static.Program()
         with paddle.static.program_guard(main_program, startup_program):
-            w = paddle.rand([10, 20])
+            w = paddle.rand([10, 3])
             ids = paddle.static.data(name="id", shape=[5], dtype='int64')
+            data = paddle.static.data(name="data", shape=[3], dtype='float32')
             emb = paddle.nn.functional.embedding(
                 x=ids, weight=w, sparse=False, name="embedding")
+            emb = emb + data
 
         return main_program, startup_program, emb
 
@@ -273,7 +275,7 @@ def _run(self, feeds):
 
         for feed in feeds:
             out = exe.run(main_program, feed=feed, fetch_list=fetch_vars)
-
+        print(out)
         return out
 
     def run_new_executor(self, feed):
@@ -284,12 +286,27 @@ def run_new_executor(self, feed):
 
     def test_exception(self):
         feed = [{
-            'id': np.array([1, 2, 3, 4, 5]).astype(np.int64)
+            'id': np.array([1, 2, 3, 4, 5]).astype(np.int64),
+            'data': np.array([1, 2, 3, 4]).astype(np.float32),
         }, {
-            'id': np.array([1, 2, 3, 4, 11]).astype(np.int64)
+            'id': np.array([1, 2, 3, 4, 11]).astype(np.int64),
+            'data': np.array([1, 2, 3, 4]).astype(np.float32),
         }]
         self.assertRaises(ValueError, self.run_new_executor, feed)
 
+    def test_nan(self):
+        flags = {'FLAGS_check_nan_inf': True, 'FLAGS_benchmark': True}
+        paddle.fluid.set_flags(flags)
+        feed = [{
+            'id': np.array([1, 2, 3, 4, 5]).astype(np.int64),
+            'data': np.array([1, 2, 3]).astype(np.float32),
+        }, {
+            'id': np.array([1, 2, 3, 4, 5]).astype(np.int64),
+            'data': np.array([1, 2, 3]).astype(np.float32),
+        }]
+        feed[1]['data'][0] = np.nan
+        self.assertRaises(RuntimeError, self.run_new_executor, feed)
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/CMakeLists.txt b/python/paddle/fluid/tests/unittests/ir/inference/CMakeLists.txt
index 927456b396ea5..43cdb85e75edd 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/ir/inference/CMakeLists.txt
@@ -71,4 +71,9 @@ set_tests_properties(test_trt_matmul_quant_dequant PROPERTIES TIMEOUT 100)
 set_tests_properties(test_trt_conv3d_op PROPERTIES TIMEOUT 60)
 set_tests_properties(test_trt_conv3d_transpose_op PROPERTIES TIMEOUT 60)
 set_tests_properties(test_trt_nearest_interp_v2_op PROPERTIES TIMEOUT 30)
+set_tests_properties(test_emb_eltwise_layernorm_fuse_pass PROPERTIES TIMEOUT 120)
+
+if (WITH_MKLDNN)
+  set_tests_properties(test_mkldnn_prelu_op PROPERTIES TIMEOUT 300)
+endif()
 endif()
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/auto_scan_test.py b/python/paddle/fluid/tests/unittests/ir/inference/auto_scan_test.py
index 6fc6ec875c68f..337098cde3c0d 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/auto_scan_test.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/auto_scan_test.py
@@ -17,35 +17,70 @@
 import abc
 import os
 import enum
+import time
 import logging
+import shutil
 import paddle
 import paddle.fluid as fluid
 from paddle.fluid.initializer import NumpyArrayInitializer
+from paddle.fluid.core import PassVersionChecker
 import paddle.fluid.core as core
 from paddle import compat as cpt
 import paddle.inference as paddle_infer
 from typing import Optional, List, Callable, Dict, Any, Set
 from program_config import TensorConfig, OpConfig, ProgramConfig, create_fake_model, create_quant_model
 
+import hypothesis
+from hypothesis import given, settings, seed, example, assume
+
 logging.basicConfig(level=logging.INFO, format="%(message)s")
 
+settings.register_profile(
+    "ci",
+    max_examples=100,
+    suppress_health_check=hypothesis.HealthCheck.all(),
+    deadline=None,
+    print_blob=True,
+    derandomize=True,
+    report_multiple_bugs=False)
+settings.register_profile(
+    "dev",
+    max_examples=1000,
+    suppress_health_check=hypothesis.HealthCheck.all(),
+    deadline=None,
+    print_blob=True,
+    derandomize=True,
+    report_multiple_bugs=False)
+if float(os.getenv('TEST_NUM_PERCENT_CASES', default='1.0')) < 1 or \
+    os.getenv('HYPOTHESIS_TEST_PROFILE', 'dev') == 'ci':
+    settings.load_profile("ci")
+else:
+    settings.load_profile("dev")
+
 
 class SkipReasons(enum.Enum):
     # Paddle not support, but trt support, we need to add the feature.
     TRT_NOT_IMPLEMENTED = 0
     # TRT not support.
     TRT_NOT_SUPPORT = 1
+    # Accuracy is abnormal after enabling pass.
+    PASS_ACCURACY_ERROR = 2
+    # Accuracy is abnormal after enabling mkldnn.
+    MKLDNN_ACCURACY_ERROR = 3
 
 
 class AutoScanTest(unittest.TestCase):
-    def __init__(self, methodName='runTest'):
+    def __init__(self, *args, **kwargs):
         np.random.seed(1024)
         paddle.enable_static()
-        super(AutoScanTest, self).__init__(methodName)
+        super(AutoScanTest, self).__init__(*args, **kwargs)
         self.skip_cases = []
+        abs_dir = os.path.abspath(os.path.dirname(__file__))
+        self.cache_dir = os.path.join(abs_dir,
+                                      str(self.__module__) + '_cache_dir')
 
     @abc.abstractmethod
-    def sample_program_configs(self) -> List[ProgramConfig]:
+    def sample_program_configs(self):
         '''
         Generate all config with the combination of different Input tensor shape and
         different Attr values.
@@ -53,7 +88,7 @@ def sample_program_configs(self) -> List[ProgramConfig]:
         raise NotImplementedError
 
     @abc.abstractmethod
-    def sample_predictor_configs(self) -> List[paddle_infer.Config]:
+    def sample_predictor_configs(self):
         raise NotImplementedError
 
     @abc.abstractmethod
@@ -88,21 +123,488 @@ def run_test_config(self, model, params, prog_config, pred_config,
             result[out_name] = predictor.get_output_handle(o_name).copy_to_cpu()
         return result
 
+    @abc.abstractmethod
     def assert_tensors_near(self,
-                            threshold: float,
-                            tensors: List[Dict[str, np.array]]):
-        assert len(tensors) > 1
-        first = tensors[0]
-        for group in tensors[1:]:
-            for key, arr in group.items():
-                self.assertTrue(
-                    first[key].shape == arr.shape,
-                    "The output shape of GPU and TensorRT are not equal.")
-                self.assertTrue(
-                    np.allclose(
-                        first[key], arr, atol=threshold),
-                    "Output has diff between GPU and TensorRT. ")
+                            atol: float,
+                            rtol: float,
+                            tensor: Dict[str, np.array],
+                            baseline: Dict[str, np.array]):
+        for key, arr in tensor.items():
+            self.assertTrue(
+                baseline[key].shape == arr.shape,
+                "The output shapes are not equal, the baseline shape is " +
+                str(baseline[key].shape) + ', but got ' + str(arr.shape))
+            self.assertTrue(
+                np.allclose(
+                    baseline[key], arr, atol=atol, rtol=rtol),
+                "Output has diff. ")
 
     @abc.abstractmethod
     def run_test(self, quant=False):
         raise NotImplementedError
+
+    def generate_op_config(self,
+                           ops_config: List[Dict[str, Any]]) -> List[OpConfig]:
+        ops = []
+        for i in range(len(ops_config)):
+            op_config = ops_config[i]
+            ops.append(
+                OpConfig(
+                    type=op_config['op_type'],
+                    inputs=op_config['op_inputs'],
+                    outputs=op_config['op_outputs'],
+                    attrs=op_config['op_attrs']))
+        return ops
+
+    @abc.abstractmethod
+    def skip_log(self, msg: str):
+        logging.warning("SKIP: " + msg)
+
+    @abc.abstractmethod
+    def fail_log(self, msg: str):
+        logging.error("FAILE: " + msg)
+
+    @abc.abstractmethod
+    def success_log(self, msg: str):
+        logging.info("SUCCESS: " + msg)
+
+    @abc.abstractmethod
+    def create_inference_config(self,
+                                passes: Optional[List[str]]=None,
+                                use_gpu: bool=False,
+                                use_mkldnn: bool=False,
+                                ir_optim: Optional[bool]=None):
+        config = paddle_infer.Config()
+        config.switch_ir_debug(True)
+        config.set_optim_cache_dir(self.cache_dir)
+        config.disable_glog_info()
+        if ir_optim is not None:
+            config.switch_ir_optim(ir_optim)
+        if use_gpu:
+            config.enable_use_gpu(100, 0)
+        if use_mkldnn:
+            config.enable_mkldnn()
+        if passes is not None:
+            config.pass_builder().set_passes(passes)
+            self.passes = passes
+        return config
+
+
+class MkldnnAutoScanTest(AutoScanTest):
+    def __init__(self, *args, **kwargs):
+        super(MkldnnAutoScanTest, self).__init__(*args, **kwargs)
+
+    def run_test(self, quant=False, *args, **kwargs):
+        status = True
+
+        for prog_config in self.sample_program_configs(*args, **kwargs):
+            # if program is invalid, we should skip that cases.
+            if not self.is_program_valid(prog_config):
+                continue
+
+            model, params = create_fake_model(prog_config)
+            if quant:
+                model, params = create_quant_model(model, params)
+
+            feed_data = {}
+            for name, tensor_config in prog_config.inputs.items():
+                feed_data[name] = {
+                    'data': tensor_config.data,
+                    'lod': tensor_config.lod
+                }
+            results: List[Dict[str, np.ndarray]] = []
+
+            # baseline: cpu no ir_optim run
+            base_config = self.create_inference_config(ir_optim=False)
+            logging.info('RUN program_config: ' + str(prog_config))
+            results.append(
+                self.run_test_config(model, params, prog_config, base_config,
+                                     feed_data))
+            self.success_log('RUN_CPU_BASELINE done')
+
+            for pred_config, (
+                    atol, rtol) in self.sample_predictor_configs(prog_config):
+                # skip info
+                skip_flag = False
+                for skip_info in self.skip_cases:
+                    if skip_info[0](prog_config, pred_config):
+                        skip_flag = True
+                        if skip_info[1] == SkipReasons.MKLDNN_ACCURACY_ERROR:
+                            self.skip_log("[MKLDNN_ACCURACY_ERROR] " +
+                                          skip_info[2] + ' ' + ' vs ' + self.
+                                          inference_config_str(pred_config))
+                        else:
+                            raise NotImplementedError
+                        break
+
+                if os.path.exists(self.cache_dir):
+                    shutil.rmtree(self.cache_dir)
+                if not os.path.exists(self.cache_dir):
+                    os.mkdir(self.cache_dir)
+
+                try:
+                    results.append(
+                        self.run_test_config(model, params, prog_config,
+                                             pred_config, feed_data))
+                    self.assert_tensors_near(atol, rtol, results[-1],
+                                             results[0])
+                except Exception as e:
+                    self.fail_log(
+                        self.inference_config_str(pred_config) +
+                        '\033[1;31m \nERROR INFO: {}\033[0m'.format(str(e)))
+                    if not skip_flag:
+                        status = False
+                    continue
+                self.success_log('RUN predictor_config ' + self.
+                                 inference_config_str(pred_config) + ' done')
+
+        self.assertTrue(status)
+
+    def inference_config_str(self, config) -> bool:
+        dic = {}
+        enable_mkldnn = config.mkldnn_enabled()
+        dic['use_mkldnn'] = enable_mkldnn
+        enable_gpu = config.use_gpu()
+        dic['use_gpu'] = enable_gpu
+        return str(dic)
+
+
+class PassAutoScanTest(AutoScanTest):
+    def __init__(self, *args, **kwargs):
+        super(PassAutoScanTest, self).__init__(*args, **kwargs)
+        self.passes = []
+
+    def check_op_version(self):
+        status = True
+        for pass_name in self.passes:
+            if not PassVersionChecker.IsCompatible(pass_name):
+                self.fail_log('{} version check failed.'.format(pass_name))
+                status = False
+        return status
+
+    def assert_op_size(self, fusion_before_num, fusion_after_num, origin_model):
+        if not self.passes:
+            raise ValueError(
+                'In PassAutoScan you should give a valid pass name.')
+        last_passed_program = os.path.join(self.cache_dir,
+                                           self.passes[-1] + '.pdmodel')
+        model_bytes = paddle.static.load_from_file(last_passed_program)
+        pg = paddle.static.deserialize_program(model_bytes)
+        main_block = pg.desc.block(0)
+        after_op_size = main_block.op_size()
+        pg = paddle.static.deserialize_program(origin_model)
+        main_block = pg.desc.block(0)
+        before_op_size = main_block.op_size()
+        self.assertTrue(before_op_size == fusion_before_num,
+                        'before fusion op size is {}, but got {}!'.format(
+                            before_op_size, fusion_before_num))
+        self.assertTrue(after_op_size == fusion_after_num,
+                        'after fusion op size is {}, but got {}!'.format(
+                            after_op_size, fusion_after_num))
+
+    def run_test(self, quant=False, *args, **kwargs):
+        status = True
+
+        for prog_config in self.sample_program_configs(*args, **kwargs):
+            # if program is invalid, we should skip that cases.
+            if not self.is_program_valid(prog_config):
+                continue
+
+            model, params = create_fake_model(prog_config)
+            if quant:
+                model, params = create_quant_model(model, params)
+
+            feed_data = {}
+            for name, tensor_config in prog_config.inputs.items():
+                feed_data[name] = {
+                    'data': tensor_config.data,
+                    'lod': tensor_config.lod
+                }
+            results: List[Dict[str, np.ndarray]] = []
+
+            # baseline: cpu no ir_optim run
+            base_config = self.create_inference_config(ir_optim=False)
+            logging.info('RUN program_config: ' + str(prog_config))
+            results.append(
+                self.run_test_config(model, params, prog_config, base_config,
+                                     feed_data))
+            self.success_log('RUN_CPU_BASELINE done')
+
+            for pred_config, nodes_num, (
+                    atol, rtol) in self.sample_predictor_configs(prog_config):
+                # skip info
+                skip_flag = False
+                for skip_info in self.skip_cases:
+                    if skip_info[0](prog_config, pred_config):
+                        skip_flag = True
+                        if skip_info[1] == SkipReasons.PASS_ACCURACY_ERROR:
+                            self.skip_log("[PASS_ACCURACY_ERROR] " + skip_info[
+                                2] + ' ' + ' vs ' + self.inference_config_str(
+                                    pred_config))
+                        else:
+                            raise NotImplementedError
+                        break
+
+                if os.path.exists(self.cache_dir):
+                    shutil.rmtree(self.cache_dir)
+                if not os.path.exists(self.cache_dir):
+                    os.mkdir(self.cache_dir)
+
+                try:
+                    results.append(
+                        self.run_test_config(model, params, prog_config,
+                                             pred_config, feed_data))
+                    self.assert_tensors_near(atol, rtol, results[-1],
+                                             results[0])
+                    if not skip_flag:
+                        self.assert_op_size(nodes_num[0], nodes_num[1], model)
+
+                except Exception as e:
+                    self.fail_log(
+                        self.inference_config_str(pred_config) +
+                        '\033[1;31m \nERROR INFO: {}\033[0m'.format(str(e)))
+                    if not skip_flag:
+                        status = False
+                    continue
+                self.success_log('RUN predictor_config ' + self.
+                                 inference_config_str(pred_config) + ' done')
+
+        status = self.check_op_version() and status
+        self.assertTrue(status)
+
+    def inference_config_str(self, config) -> bool:
+        dic = {}
+        enable_mkldnn = config.mkldnn_enabled()
+        dic['use_mkldnn'] = enable_mkldnn
+        enable_gpu = config.use_gpu()
+        dic['use_gpu'] = enable_gpu
+        if not self.passes:
+            dic['passes'] = self.passes
+
+        enable_trt = config.tensorrt_engine_enabled()
+        trt_precison = config.tensorrt_precision_mode()
+        trt_dynamic_shape = config.tensorrt_dynamic_shape_enabled()
+        if enable_trt:
+            dic['use_trt'] = True
+            dic['trt_precision'] = trt_precison
+            dic['use_dynamic_shape'] = trt_dynamic_shape
+        else:
+            dic['use_trt'] = False
+        return str(dic)
+
+    def create_trt_inference_config(self) -> paddle_infer.Config:
+        config = paddle_infer.Config()
+        config.disable_glog_info()
+        config.enable_use_gpu(100, 0)
+        config.set_optim_cache_dir(self.cache_dir)
+        config.switch_ir_debug()
+        # for assert_op_size.
+        self.passes = ['transpose_flatten_concat_fuse_pass']
+        return config
+
+
+class TrtLayerAutoScanTest(AutoScanTest):
+    class TensorRTParam:
+        '''
+        TensorRT subgraph engine parameters. 
+        '''
+
+        def __init__(self, workspace_size, max_batch_size, min_subgraph_size,
+                     precision, use_static, use_calib_mode):
+            self.workspace_size = workspace_size
+            self.max_batch_size = max_batch_size
+            self.min_subgraph_size = min_subgraph_size
+            self.precision = precision
+            self.use_static = use_static
+            self.use_calib_mode = use_calib_mode
+
+    class DynamicShapeParam:
+        '''
+         Prepare TensorRT subgraph engine dynamic shape parameters. 
+         '''
+
+        def __init__(self, min_input_shape, max_input_shape, opt_input_shape,
+                     disable_trt_plugin_fp16):
+            self.min_input_shape = min_input_shape
+            self.max_input_shape = max_input_shape
+            self.opt_input_shape = opt_input_shape
+            self.disable_trt_plugin_fp16 = disable_trt_plugin_fp16
+
+    def __init__(self, *args, **kwargs):
+        super(TrtLayerAutoScanTest, self).__init__(*args, **kwargs)
+        self.trt_param = self.TensorRTParam(
+            workspace_size=1024,
+            max_batch_size=4,
+            min_subgraph_size=0,
+            precision=paddle_infer.PrecisionType.Float32,
+            use_static=True,
+            use_calib_mode=False)
+        self.dynamic_shape = self.DynamicShapeParam({}, {}, {}, False)
+        self.num_percent_cases = float(
+            os.getenv(
+                'TEST_NUM_PERCENT_CASES', default='1.0'))
+        # Choose different tests by week
+        np.random.seed(int(time.strftime("%W")))
+
+    def create_inference_config(self, use_trt=True) -> paddle_infer.Config:
+        config = paddle_infer.Config()
+        config.disable_glog_info()
+        config.enable_use_gpu(100, 0)
+        config.set_optim_cache_dir(self.cache_dir)
+        if use_trt:
+            config.switch_ir_debug()
+            config.enable_tensorrt_engine(
+                max_batch_size=self.trt_param.max_batch_size,
+                workspace_size=self.trt_param.workspace_size,
+                min_subgraph_size=self.trt_param.min_subgraph_size,
+                precision_mode=self.trt_param.precision,
+                use_static=self.trt_param.use_static,
+                use_calib_mode=self.trt_param.use_calib_mode)
+            if len(self.dynamic_shape.min_input_shape
+                   ) != 0 and self.dynamic_shape.min_input_shape.keys(
+                   ) == self.dynamic_shape.max_input_shape.keys(
+                   ) and self.dynamic_shape.min_input_shape.keys(
+                   ) == self.dynamic_shape.opt_input_shape.keys():
+                config.set_trt_dynamic_shape_info(
+                    self.dynamic_shape.min_input_shape,
+                    self.dynamic_shape.max_input_shape,
+                    self.dynamic_shape.opt_input_shape,
+                    self.dynamic_shape.disable_trt_plugin_fp16)
+        return config
+
+    def assert_op_size(self, trt_engine_num, paddle_op_num):
+        last_passed_program = os.path.join(
+            self.cache_dir, 'transpose_flatten_concat_fuse_pass.pdmodel')
+        model_bytes = paddle.static.load_from_file(last_passed_program)
+        pg = paddle.static.deserialize_program(model_bytes)
+        main_block = pg.desc.block(0)
+        op_size = main_block.op_size()
+        op_types = [
+            main_block.op(i).type() == 'tensorrt_engine' for i in range(op_size)
+        ]
+        trt_engine_size = sum(op_types)
+        paddle_op_size = op_size - trt_engine_size
+        self.assertTrue(trt_engine_size == trt_engine_num,
+                        'trt_engine_num is {}, but got {}!'.format(
+                            trt_engine_size, trt_engine_num))
+        self.assertTrue(paddle_op_size == paddle_op_num,
+                        'paddle_op_num is {}, but got {}!'.format(
+                            paddle_op_size, paddle_op_num))
+
+    def inference_config_str(self, config: paddle_infer.Config):
+        dic = {}
+        enable_trt = config.tensorrt_engine_enabled()
+        trt_precison = config.tensorrt_precision_mode()
+        trt_dynamic_shape = config.tensorrt_dynamic_shape_enabled()
+        if enable_trt:
+            dic['use_trt'] = True
+            dic['trt_precision'] = trt_precison
+            dic['use_dynamic_shape'] = trt_dynamic_shape
+        else:
+            dic['use_trt'] = False
+        return str(dic)
+
+    def run_test(self, quant=False, *args, **kwargs):
+        status = True
+        run_flags = []
+        for prog_config in self.sample_program_configs(*args, **kwargs):
+            # In CI, only run 10% cases
+            if np.random.rand() < self.num_percent_cases:
+                run_flags.append(True)
+            else:
+                run_flags.append(False)
+
+        for prog_config, run_flags in zip(
+                self.sample_program_configs(*args, **kwargs), run_flags):
+            if not run_flags:
+                continue
+
+            # if program is invalid, we should skip that cases.
+            if not self.is_program_valid(prog_config):
+                continue
+
+            model, params = create_fake_model(prog_config)
+            if quant:
+                model, params = create_quant_model(model, params)
+
+            feed_data = {}
+            for name, tensor_config in prog_config.inputs.items():
+                feed_data[name] = {
+                    'data': tensor_config.data,
+                    'lod': tensor_config.lod
+                }
+
+            results: List[Dict[str, np.ndarray]] = []
+
+            # baseline: gpu run
+            logging.info('RUN program_config: ' + str(prog_config))
+            gpu_config = self.create_inference_config(use_trt=False)
+            results.append(
+                self.run_test_config(model, params, prog_config, gpu_config,
+                                     feed_data))
+            self.success_log('RUN_GPU_BASELINE done')
+
+            for pred_config, nodes_num, threshold in self.sample_predictor_configs(
+                    prog_config):
+
+                if os.path.exists(self.cache_dir):
+                    shutil.rmtree(self.cache_dir)
+
+                if isinstance(threshold, float):
+                    atol = threshold
+                    rtol = 1e-8
+                elif isinstance(threshold, list) or isinstance(threshold,
+                                                               tuple):
+                    atol = threshold[0]
+                    rtol = threshold[1]
+                else:
+                    raise NotImplementedError
+
+                if quant and pred_config.tensorrt_precision_mode(
+                ) != paddle_infer.PrecisionType.Int8:
+                    continue
+                if pred_config.tensorrt_precision_mode(
+                ) == paddle_infer.PrecisionType.Int8 and not quant:
+                    continue
+
+                skip_flag = False
+                for skip_info in self.skip_cases:
+                    if skip_info[0](prog_config, pred_config):
+                        skip_flag = True
+                        if skip_info[1] == SkipReasons.TRT_NOT_IMPLEMENTED:
+                            self.skip_log("[TRT_NOT_IMPLEMENTED] " + skip_info[
+                                2] + ' ' + ' vs ' + self.inference_config_str(
+                                    pred_config))
+                        elif skip_info[1] == SkipReasons.TRT_NOT_SUPPORT:
+                            self.skip_log("[TRT_NOT_SUPPORT] " + skip_info[
+                                2] + ' ' + ' vs ' + self.inference_config_str(
+                                    pred_config))
+                        else:
+                            raise NotImplementedError
+                        break
+
+                try:
+                    pred_config_deserialize = paddle_infer.Config(pred_config)
+                    results.append(
+                        self.run_test_config(model, params, prog_config,
+                                             pred_config, feed_data))
+                    self.assert_tensors_near(atol, rtol, results[-1],
+                                             results[0])
+                    if not skip_flag:
+                        self.assert_op_size(nodes_num[0], nodes_num[1])
+                    # deserialize test
+                    if nodes_num[0] > 0:
+                        self.run_test_config(model, params, prog_config,
+                                             pred_config_deserialize, feed_data)
+                except Exception as e:
+                    self.fail_log(
+                        str(prog_config) + ' vs ' + self.inference_config_str(
+                            pred_config) +
+                        '\033[1;31m \nERROR INFO: {}\033[0m'.format(str(e)))
+                    if not skip_flag:
+                        status = False
+                    continue
+                self.success_log('RUN predictor_config ' + self.
+                                 inference_config_str(pred_config) + ' done')
+
+        self.assertTrue(status)
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_emb_eltwise_layernorm_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_emb_eltwise_layernorm_fuse_pass.py
new file mode 100644
index 0000000000000..2046307e5c518
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_emb_eltwise_layernorm_fuse_pass.py
@@ -0,0 +1,321 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from auto_scan_test import PassAutoScanTest, SkipReasons
+from program_config import TensorConfig, ProgramConfig
+import numpy as np
+import paddle.inference as paddle_infer
+from functools import partial
+from typing import Optional, List, Callable, Dict, Any, Set
+import unittest
+
+import hypothesis
+from hypothesis import given, settings, seed, example, assume
+import hypothesis.strategies as st
+
+
+class TestEmbeddingEltwiseLayerNormFusePass(PassAutoScanTest):
+    '''
+  in_var1  emb_var   in_var2   emb_var   in_var3   emb_var   in_var   emb_var
+    |        |        |         |        |         |           |         |
+   lookup_table      lookup_table       lookup_table   ...    lookup_table
+        |                 |                  |                     |
+     lkt_var           lkt_var            lkt_var               lkt_var
+        \                 /                  |         ...         |
+          elementwise_add                    |                     |
+                 \                          /                      |
+                       elementwise_add                             |
+                               |                                   |
+                            elt_var                               /
+                               \                                 /
+                                         elementwise_add
+                                                 |
+                                            layer_norm
+    '''
+
+    def is_program_valid(self, program_config: ProgramConfig) -> bool:
+        # is_sparse is only support False
+        if program_config.ops[0].attrs['is_sparse'] == True:
+            return False
+
+        # is_distributed only support False
+        if program_config.ops[0].attrs['is_distributed'] == True:
+            return False
+
+        # axis only support -1 and the last dim.
+        if program_config.ops[3].attrs['axis'] not in [-1, 2]:
+            return False
+
+        if not (program_config.ops[5].attrs['epsilon'] >= 0 and
+                program_config.ops[5].attrs['epsilon'] <= 0.001):
+            return False
+
+        if program_config.ops[5].attrs['begin_norm_axis'] != 2:
+            return False
+
+        # input check
+        if program_config.weights['embedding_weight1'].shape[
+                1] != program_config.weights['layer_norm_scale'].shape[0]:
+            return False
+
+        return True
+
+    def sample_program_configs(self, *args, **kwargs):
+        def generate_input(attrs):
+            if attrs[0]['op_type'] == 'lookup_table':
+                return np.random.randint(
+                    0,
+                    attrs[3]['weight_size'][0],
+                    size=(attrs[3]['batch_size'], attrs[3]['input_dim'],
+                          1)).astype(np.int64)
+            else:
+                return np.random.randint(
+                    0,
+                    attrs[3]['weight_size'][0],
+                    size=(attrs[3]['batch_size'],
+                          attrs[3]['input_dim'])).astype(np.int64)
+
+        def generate_weight1(attrs):
+            # set embedding weight by attrs
+            return np.random.random(attrs['weight_size']).astype(np.float32)
+
+        def generate_weight2(attrs):
+            # set layernorm weight by attrs
+            if attrs[2]['begin_norm_axis'] == 1:
+                return np.random.random(
+                    attrs[3]['input_dim'] *
+                    attrs[3]['weight_size'][1]).astype(np.float32)
+            else:
+                return np.random.random(attrs[3]['weight_size'][1]).astype(
+                    np.float32)
+
+        attrs = [{
+            'is_sparse': kwargs['is_sparse'],
+            'is_distributed': kwargs['is_distributed'],
+            'padding_idx': kwargs['padding_idx'],
+            'op_type': kwargs['op_type']
+        }, {
+            'axis': kwargs['axis']
+        }, {
+            'begin_norm_axis': kwargs['begin_norm_axis'],
+            'epsilon': kwargs['epsilon']
+        }, {
+            'batch_size': kwargs['batch_size'],
+            'input_dim': kwargs['input_dim'],
+            'weight_size': kwargs['weight_size']
+        }]
+
+        ops_config = [{
+            "op_type": attrs[0]['op_type'],
+            "op_inputs": {
+                "Ids": ["input_data1"],
+                "W": ["embedding_weight1"]
+            },
+            "op_outputs": {
+                "Out": ["embedding_output1"]
+            },
+            "op_attrs": {
+                'is_sparse': attrs[0]['is_sparse'],
+                'is_distributed': attrs[0]['is_distributed'],
+                'padding_idx': attrs[0]['padding_idx'],
+            }
+        }, {
+            "op_type": attrs[0]['op_type'],
+            "op_inputs": {
+                "Ids": ["input_data2"],
+                "W": ["embedding_weight2"]
+            },
+            "op_outputs": {
+                "Out": ["embedding_output2"]
+            },
+            "op_attrs": {
+                'is_sparse': attrs[0]['is_sparse'],
+                'is_distributed': attrs[0]['is_distributed'],
+                'padding_idx': attrs[0]['padding_idx'],
+            },
+        }, {
+            "op_type": attrs[0]['op_type'],
+            "op_inputs": {
+                "Ids": ["input_data3"],
+                "W": ["embedding_weight3"]
+            },
+            "op_outputs": {
+                "Out": ["embedding_output3"]
+            },
+            "op_attrs": {
+                'is_sparse': attrs[0]['is_sparse'],
+                'is_distributed': attrs[0]['is_distributed'],
+                'padding_idx': attrs[0]['padding_idx'],
+            },
+        }, {
+            "op_type": "elementwise_add",
+            "op_inputs": {
+                "X": ["embedding_output2"],
+                "Y": ["embedding_output3"]
+            },
+            "op_outputs": {
+                "Out": ["elementwise_add_output1"]
+            },
+            "op_attrs": {
+                "axis": attrs[1]['axis'],
+            }
+        }, {
+            "op_type": "elementwise_add",
+            "op_inputs": {
+                "X": ["elementwise_add_output1"],
+                "Y": ["embedding_output1"]
+            },
+            "op_outputs": {
+                "Out": ["elementwise_add_output2"]
+            },
+            "op_attrs": {
+                "axis": attrs[1]['axis'],
+            }
+        }, {
+            "op_type": "layer_norm",
+            "op_inputs": {
+                "X": ["elementwise_add_output2"],
+                "Bias": ["layer_norm_bias"],
+                "Scale": ["layer_norm_scale"]
+            },
+            "op_outputs": {
+                "Y": ["layer_norm_output1"],
+                "Mean": ["layer_norm_output2"],
+                "Variance": ["layer_norm_output3"]
+            },
+            "op_attrs": {
+                'begin_norm_axis': attrs[2]['begin_norm_axis'],
+                'epsilon': attrs[2]['epsilon'],
+            }
+        }]
+
+        ops = self.generate_op_config(ops_config)
+
+        program_config = ProgramConfig(
+            ops=ops,
+            weights={
+                "embedding_weight1":
+                TensorConfig(data_gen=partial(generate_weight1, attrs[3])),
+                "embedding_weight2":
+                TensorConfig(data_gen=partial(generate_weight1, attrs[3])),
+                "embedding_weight3":
+                TensorConfig(data_gen=partial(generate_weight1, attrs[3])),
+                "layer_norm_bias":
+                TensorConfig(data_gen=partial(generate_weight2, attrs)),
+                "layer_norm_scale":
+                TensorConfig(data_gen=partial(generate_weight2, attrs))
+            },
+            inputs={
+                "input_data1":
+                TensorConfig(data_gen=partial(generate_input, attrs)),
+                "input_data2":
+                TensorConfig(data_gen=partial(generate_input, attrs)),
+                "input_data3":
+                TensorConfig(data_gen=partial(generate_input, attrs))
+            },
+            outputs=["layer_norm_output1"])
+
+        yield program_config
+
+    def sample_predictor_configs(self, program_config):
+        # only used in gpu passes and trt passes.
+        config = self.create_inference_config(
+            passes=['embedding_eltwise_layernorm_fuse_pass'], use_gpu=True)
+        yield config, (10, 5), (1e-5, 1e-5)
+        # trt static_shape
+        config = self.create_trt_inference_config()
+        config.enable_tensorrt_engine(
+            max_batch_size=4,
+            workspace_size=102400,
+            min_subgraph_size=0,
+            precision_mode=paddle_infer.PrecisionType.Float32,
+            use_static=False,
+            use_calib_mode=False)
+        yield config, (10, 3), (1e-5, 1e-5)
+        # trt dynamic_shape
+        config = self.create_trt_inference_config()
+        config.enable_tensorrt_engine(
+            max_batch_size=4,
+            workspace_size=102400,
+            min_subgraph_size=0,
+            precision_mode=paddle_infer.PrecisionType.Float32,
+            use_static=False,
+            use_calib_mode=False)
+        if program_config.ops[0].type == 'lookup_table':
+            config.set_trt_dynamic_shape_info({
+                "input_data1": [1, 4, 1],
+                "input_data2": [1, 4, 1],
+                "input_data3": [1, 4, 1]
+            }, {
+                "input_data1": [4, 512, 1],
+                "input_data2": [4, 512, 1],
+                "input_data3": [4, 512, 1]
+            }, {
+                "input_data1": [2, 128, 1],
+                "input_data2": [2, 128, 1],
+                "input_data3": [2, 128, 1]
+            })
+        else:
+            config.set_trt_dynamic_shape_info({
+                "input_data1": [1, 4],
+                "input_data2": [1, 4],
+                "input_data3": [1, 4]
+            }, {
+                "input_data1": [4, 512],
+                "input_data2": [4, 512],
+                "input_data3": [4, 512]
+            }, {
+                "input_data1": [2, 128],
+                "input_data2": [2, 128],
+                "input_data3": [2, 128]
+            })
+        yield config, (10, 3), (1e-5, 1e-5)
+
+    def add_skip_pass_case(self):
+        def teller1(program_config, predictor_config):
+            if program_config.ops[3].attrs['axis'] in [
+                    -1, 2
+            ] and program_config.ops[5].attrs[
+                    'begin_norm_axis'] == 2 and program_config.weights[
+                        'embedding_weight1'].shape in [(64, 32), (64, 64)]:
+                return True
+            return False
+
+        self.add_skip_case(teller1, SkipReasons.PASS_ACCURACY_ERROR,
+                           "The pass output has diff in a specific case.")
+
+    @given(
+        is_sparse=st.booleans(),
+        is_distributed=st.booleans(),
+        padding_idx=st.integers(),
+        axis=st.integers(
+            min_value=-4, max_value=4),
+        op_type=st.sampled_from(['lookup_table', 'lookup_table_v2']),
+        epsilon=st.floats(
+            min_value=0, max_value=0.001),
+        begin_norm_axis=st.integers(
+            min_value=-4, max_value=4),
+        batch_size=st.integers(
+            min_value=1, max_value=4),
+        input_dim=st.sampled_from([32, 64]),
+        weight_size=st.sampled_from([[64, 64], [64, 32]]))
+    def test(self, *args, **kwargs):
+        assume(kwargs['begin_norm_axis'] == 2)
+
+        self.add_skip_pass_case()
+        self.run_test(quant=False, *args, **kwargs)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_activation_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_activation_fuse_pass.py
index 11d05f32c4d13..cf9b2257553b7 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_activation_fuse_pass.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_activation_fuse_pass.py
@@ -102,5 +102,14 @@ def set_params(self):
         self.pass_name = 'conv_hard_swish_mkldnn_fuse_pass'
 
 
+class ConvHardSigmoidOneDNNFusePassTest(ConvActivationMkldnnFusePassTest):
+    def set_params(self):
+        self.conv_num_filters = 5
+        self.conv_filter_size = 5
+        self.conv_bias_attr = True
+        self.act = "hard_sigmoid"
+        self.pass_name = 'conv_hard_sigmoid_mkldnn_fuse_pass'
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_prelu_op.py b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_prelu_op.py
new file mode 100644
index 0000000000000..32642096c76c6
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_prelu_op.py
@@ -0,0 +1,103 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from auto_scan_test import MkldnnAutoScanTest, SkipReasons
+from program_config import TensorConfig, ProgramConfig
+import numpy as np
+import paddle.inference as paddle_infer
+from functools import partial
+from typing import Optional, List, Callable, Dict, Any, Set
+import unittest
+
+import hypothesis
+from hypothesis import given, settings, seed, example, assume
+import hypothesis.strategies as st
+
+
+class TestMkldnnPreluOp(MkldnnAutoScanTest):
+    def is_program_valid(self, program_config: ProgramConfig) -> bool:
+        # if mode is channel, and in_shape is 1 rank
+        if len(program_config.inputs['input_data'].
+               shape) == 1 and program_config.ops[0].attrs['mode'] == 'channel':
+            return False
+        return True
+
+    def sample_program_configs(self, *args, **kwargs):
+        def generate_input(*args, **kwargs):
+            return np.random.random(kwargs['in_shape']).astype(np.float32)
+
+        def generate_alpha(*args, **kwargs):
+            if kwargs["mode"] == "all":
+                return np.random.random(size=(1)).astype(np.float32)
+            elif kwargs["mode"] == "channel":
+                if len(kwargs['in_shape']) <= 1:
+                    # not valid case, just return 0
+                    return np.zeros((1)).astype(np.float32)
+                return np.random.random(kwargs['in_shape'][1]).astype(
+                    np.float32)
+            else:
+                if len(kwargs['in_shape']) <= 1:
+                    # not valid case, just return 0
+                    return np.zeros((1)).astype(np.float32)
+                return np.random.random(kwargs['in_shape']).astype(np.float32)
+
+        ops_config = [{
+            "op_type": "prelu",
+            "op_inputs": {
+                "X": ["input_data"],
+                "Alpha": ["alpha_weight"]
+            },
+            "op_outputs": {
+                "Out": ["output_data"]
+            },
+            "op_attrs": {
+                "mode": kwargs['mode']
+            }
+        }]
+
+        ops = self.generate_op_config(ops_config)
+
+        program_config = ProgramConfig(
+            ops=ops,
+            weights={
+                "alpha_weight":
+                TensorConfig(data_gen=partial(generate_alpha, *args, **kwargs))
+            },
+            inputs={
+                "input_data":
+                TensorConfig(data_gen=partial(generate_input, *args, **kwargs)),
+            },
+            outputs=["output_data"])
+
+        yield program_config
+
+    def sample_predictor_configs(self, program_config):
+        config = self.create_inference_config(use_mkldnn=True)
+        yield config, (1e-5, 1e-5)
+
+    def add_skip_pass_case(self):
+        pass
+
+    @given(
+        mode=st.sampled_from(['all', 'channel', 'element']),
+        in_shape=st.lists(
+            st.integers(
+                min_value=1, max_value=32), min_size=1, max_size=4))
+    def test(self, *args, **kwargs):
+        self.add_skip_pass_case()
+        self.run_test(quant=False, *args, **kwargs)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_conv2d.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_conv2d.py
index fd4b5ad9a72b6..4726524523552 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_conv2d.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_conv2d.py
@@ -15,6 +15,7 @@
 from trt_layer_auto_scan_test import TrtLayerAutoScanTest, SkipReasons
 from program_config import TensorConfig, ProgramConfig
 import numpy as np
+import unittest
 import paddle.inference as paddle_infer
 from functools import partial
 from typing import Optional, List, Callable, Dict, Any, Set
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_conv2d_fusion.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_conv2d_fusion.py
index 9fcbda4443de5..d811f3eac49bf 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_conv2d_fusion.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_conv2d_fusion.py
@@ -18,6 +18,7 @@
 import paddle.inference as paddle_infer
 from functools import partial
 from typing import Optional, List, Callable, Dict, Any, Set
+import unittest
 
 
 class TrtConvertConv2dFusionTest(TrtLayerAutoScanTest):
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_conv2d_transpose.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_conv2d_transpose.py
index 2c8f2592a737c..e21d67839eb6c 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_conv2d_transpose.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_conv2d_transpose.py
@@ -15,6 +15,7 @@
 from trt_layer_auto_scan_test import TrtLayerAutoScanTest, SkipReasons
 from program_config import TensorConfig, ProgramConfig
 import numpy as np
+import unittest
 import paddle.inference as paddle_infer
 from functools import partial
 from typing import Optional, List, Callable, Dict, Any, Set
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_deformable_conv.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_deformable_conv.py
new file mode 100644
index 0000000000000..9d29034d7fe18
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_deformable_conv.py
@@ -0,0 +1,181 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from trt_layer_auto_scan_test import TrtLayerAutoScanTest, SkipReasons
+from program_config import TensorConfig, ProgramConfig
+import numpy as np
+import paddle.inference as paddle_infer
+from functools import partial
+from typing import Optional, List, Callable, Dict, Any, Set
+import unittest
+
+
+class TrtConvertDeformableConvTest(TrtLayerAutoScanTest):
+    def is_program_valid(self, program_config: ProgramConfig) -> bool:
+        inputs = program_config.inputs
+        weights = program_config.weights
+        attrs = [
+            program_config.ops[i].attrs
+            for i in range(len(program_config.ops))
+        ]
+
+        if inputs['input_data'].shape[1] != weights['filter_data'].shape[
+                1] * attrs[0]['groups']:
+            return False
+
+        return True
+
+    def sample_program_configs(self):
+        def compute_output_size(input_size: List[int],
+                                kernel_sizes: List[int],
+                                attrs: List[Dict[str, Any]]):
+            strides = attrs[0]['strides']
+            paddings = attrs[0]['paddings']
+            dilations = attrs[0]['dilations']
+            output_size = []
+            for i, k, s, p, d in zip(input_size, kernel_sizes, strides,
+                                     paddings, dilations):
+                k = d * (k - 1) + 1
+                output_size.append((i + 2 * p - k) // s + 1)
+            return output_size
+
+        def generate_input1(batch: int,
+                            input_size: List[int],
+                            kernel_sizes: List[int],
+                            attrs: List[Dict[str, Any]]):
+            return np.random.random([batch, 3] + input_size).astype(np.float32)
+
+        def generate_offset1(batch: int,
+                             input_size: List[int],
+                             kernel_sizes: List[int],
+                             attrs: List[Dict[str, Any]]):
+            output_size = compute_output_size(input_size, kernel_sizes, attrs)
+            return np.random.random([batch, 2 * np.prod(kernel_sizes)] +
+                                    output_size).astype(np.float32)
+
+        def generate_mask1(batch: int,
+                           input_size: List[int],
+                           kernel_sizes: List[int],
+                           attrs: List[Dict[str, Any]]):
+            output_size = compute_output_size(input_size, kernel_sizes, attrs)
+            return np.random.random([batch, np.prod(kernel_sizes)] +
+                                    output_size).astype(np.float32)
+
+        def generate_filter1(batch: int,
+                             input_size: List[int],
+                             kernel_sizes: List[int],
+                             attrs: List[Dict[str, Any]]):
+            return np.random.random([6, 3] + kernel_sizes).astype(np.float32)
+
+        for batch in [1, ]:
+            for input_size in [[32, 32]]:
+                for kernel_sizes in [[3, 3]]:
+                    for strides in [[1, 1], [2, 2]]:
+                        for paddings in [[1, 1], [0, 2]]:
+                            for groups in [1, ]:
+                                for dilations in [[1, 1], [2, 2]]:
+                                    dics = [{
+                                        "strides": strides,
+                                        "paddings": paddings,
+                                        "groups": groups,
+                                        "dilations": dilations,
+                                        "deformable_groups": 1,
+                                        "im2col_step": 1
+                                    }]
+
+                                ops_config = [{
+                                    "op_type": "deformable_conv",
+                                    "op_inputs": {
+                                        "Input": ["input_data"],
+                                        "Offset": ["offset_data"],
+                                        "Mask": ["mask_data"],
+                                        "Filter": ["filter_data"]
+                                    },
+                                    "op_outputs": {
+                                        "Output": ["output_data"]
+                                    },
+                                    "op_attrs": dics[0]
+                                }]
+                                ops = self.generate_op_config(ops_config)
+
+                                program_config = ProgramConfig(
+                                    ops=ops,
+                                    weights={
+                                        "filter_data":
+                                        TensorConfig(data_gen=partial(
+                                            generate_filter1, batch, input_size,
+                                            kernel_sizes, dics))
+                                    },
+                                    inputs={
+                                        "input_data":
+                                        TensorConfig(data_gen=partial(
+                                            generate_input1, batch, input_size,
+                                            kernel_sizes, dics)),
+                                        "offset_data":
+                                        TensorConfig(data_gen=partial(
+                                            generate_offset1, batch, input_size,
+                                            kernel_sizes, dics)),
+                                        "mask_data": TensorConfig(
+                                            data_gen=partial(
+                                                generate_mask1, batch,
+                                                input_size, kernel_sizes, dics))
+                                    },
+                                    outputs=["output_data"])
+
+                                yield program_config
+
+    def sample_predictor_configs(
+            self, program_config) -> (paddle_infer.Config, List[int], float):
+        def clear_dynamic_shape():
+            self.dynamic_shape.min_input_shape = {}
+            self.dynamic_shape.max_input_shape = {}
+            self.dynamic_shape.opt_input_shape = {}
+
+        def generate_trt_nodes_num(attrs, dynamic_shape):
+            # TODO: This is just the example, need to be fixed.
+            if len(attrs[0]['paddings']) == 4:
+                return 1, 2
+            else:
+                return 1, 2
+
+        attrs = [
+            program_config.ops[i].attrs
+            for i in range(len(program_config.ops))
+        ]
+
+        # for static_shape
+        clear_dynamic_shape()
+        self.trt_param.precision = paddle_infer.PrecisionType.Float32
+        yield self.create_inference_config(), generate_trt_nodes_num(
+            attrs, False), 1e-5
+
+    def add_skip_trt_case(self):
+        def teller1(program_config, predictor_config):
+            if len(program_config.ops[0].attrs["strides"]) != 2:
+                return False
+
+            return True
+
+        self.add_skip_case(
+            teller1, SkipReasons.TRT_NOT_IMPLEMENTED,
+            "In deformable conv, length of Attr(strides) should be 2.")
+
+    def test(self):
+        self.trt_param.workspace_size = 1 << 28
+        self.add_skip_trt_case()
+        self.run_test()
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_depthwise_conv2d.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_depthwise_conv2d.py
index fc2358bb11636..b87b33d355798 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_depthwise_conv2d.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_depthwise_conv2d.py
@@ -18,6 +18,7 @@
 import paddle.inference as paddle_infer
 from functools import partial
 from typing import Optional, List, Callable, Dict, Any, Set
+import unittest
 
 
 class TrtConvertDepthwiseConv2dTest(TrtLayerAutoScanTest):
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_depthwise_conv2d_transpose.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_depthwise_conv2d_transpose.py
index 2fcd2bf5aca97..66a007f64b69c 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_depthwise_conv2d_transpose.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_depthwise_conv2d_transpose.py
@@ -18,6 +18,7 @@
 import paddle.inference as paddle_infer
 from functools import partial
 from typing import Optional, List, Callable, Dict, Any, Set
+import unittest
 
 
 class TrtConvertDepthwiseConv2dTransposeTest(TrtLayerAutoScanTest):
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_matmul.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_matmul.py
new file mode 100644
index 0000000000000..8913159b2c4df
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_matmul.py
@@ -0,0 +1,213 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from trt_layer_auto_scan_test import TrtLayerAutoScanTest, SkipReasons
+from program_config import TensorConfig, ProgramConfig
+import numpy as np
+import paddle.inference as paddle_infer
+from functools import partial
+from typing import Optional, List, Callable, Dict, Any, Set
+import unittest
+
+
+class TrtConvertMatmulTest_static(TrtLayerAutoScanTest):
+    def is_program_valid(self, program_config: ProgramConfig) -> bool:
+        return True
+
+    def sample_program_configs(self):
+        def generate_input(shape):
+            return np.random.random(shape).astype(np.float32)
+
+        for batch in [1, 4]:
+            for trans_x in [True, False]:
+                for trans_y in [True, False]:
+                    if trans_x and trans_y:
+                        input1_shape = [batch, 6, 11]
+                        input2_shape = [batch, 32, 6]
+                    if trans_x and not trans_y:
+                        input1_shape = [batch, 6, 11]
+                        input2_shape = [batch, 6, 32]
+                    if not trans_x and trans_y:
+                        input1_shape = [batch, 32, 6]
+                        input2_shape = [batch, 11, 6]
+                    if not trans_x and not trans_y:
+                        input1_shape = [batch, 32, 6]
+                        input2_shape = [batch, 6, 11]
+                    for alpha in [0.3, 1.0]:
+                        dics = [{
+                            "transpose_X": trans_x,
+                            "transpose_Y": trans_y,
+                            "alpha": alpha,
+                            "fused_reshape_X": [],
+                            "fused_reshape_Y": [],
+                            "fused_transpose_X": [],
+                            "fused_transpose_Y": [],
+                            "fused_reshape_Out": [],
+                            "fused_transpose_Out": []
+                        }]
+                        ops_config = [{
+                            "op_type": "matmul",
+                            "op_inputs": {
+                                "X": ["input1_data"],
+                                "Y": ["input2_data"]
+                            },
+                            "op_outputs": {
+                                "Out": ["output_data"]
+                            },
+                            "op_attrs": dics[0]
+                        }]
+                        ops = self.generate_op_config(ops_config)
+
+                        program_config = ProgramConfig(
+                            ops=ops,
+                            weights={},
+                            inputs={
+                                "input1_data": TensorConfig(data_gen=partial(
+                                    generate_input, input1_shape)),
+                                "input2_data": TensorConfig(data_gen=partial(
+                                    generate_input, input2_shape))
+                            },
+                            outputs=["output_data"])
+
+                        yield program_config
+
+    def sample_predictor_configs(
+            self, program_config) -> (paddle_infer.Config, List[int], float):
+        def generate_dynamic_shape(attrs):
+            pass
+
+        def clear_dynamic_shape():
+            self.dynamic_shape.max_input_shape = {}
+            self.dynamic_shape.min_input_shape = {}
+            self.dynamic_shape.opt_input_shape = {}
+
+        # for static_shape
+        clear_dynamic_shape()
+        self.trt_param.precision = paddle_infer.PrecisionType.Float32
+        yield self.create_inference_config(), (1, 3), 1e-5
+        self.trt_param.precision = paddle_infer.PrecisionType.Half
+        yield self.create_inference_config(), (1, 3), 1e-5
+
+    def test(self):
+        self.run_test()
+
+
+class TrtConvertMatmulTest_dynamic(TrtLayerAutoScanTest):
+    def is_program_valid(self, program_config: ProgramConfig) -> bool:
+        return True
+
+    def sample_program_configs(self):
+        def generate_input(shape):
+            return np.random.random(shape).astype(np.float32)
+
+        for trans_x in [True]:
+            for trans_y in [True]:
+                if trans_x and trans_y:
+                    input1_shape = [4, 4, 4]
+                    input2_shape = [4, 4, 4]
+                # if trans_x and not trans_y:
+                #     input1_shape = [4, 4, 4]
+                #     input2_shape = [4, 4, 4]
+                # if not trans_x and trans_y:
+                #     input1_shape = [batch, 32, 6]
+                #     input2_shape = [batch, 11, 6]
+                # if not trans_x and not trans_y:
+                #     input1_shape = [batch, 32, 6]
+                #     input2_shape = [batch, 6, 11]
+                for alpha in [0.3, 1.0]:
+                    dics = [{
+                        "transpose_X": trans_x,
+                        "transpose_Y": trans_y,
+                        "alpha": alpha,
+                        "fused_reshape_X": [],
+                        "fused_reshape_Y": [],
+                        "fused_transpose_X": [],
+                        "fused_transpose_Y": [],
+                        "fused_reshape_Out": [],
+                        "fused_transpose_Out": []
+                    }]
+                    ops_config = [{
+                        "op_type": "matmul",
+                        "op_inputs": {
+                            "X": ["input1_data"],
+                            "Y": ["input2_data"]
+                        },
+                        "op_outputs": {
+                            "Out": ["output_data"]
+                        },
+                        "op_attrs": dics[0]
+                    }]
+                    ops = self.generate_op_config(ops_config)
+
+                    program_config = ProgramConfig(
+                        ops=ops,
+                        weights={},
+                        inputs={
+                            "input1_data": TensorConfig(
+                                data_gen=partial(generate_input, input1_shape)),
+                            "input2_data": TensorConfig(
+                                data_gen=partial(generate_input, input2_shape))
+                        },
+                        outputs=["output_data"])
+
+                    yield program_config
+
+    def sample_predictor_configs(
+            self, program_config) -> (paddle_infer.Config, List[int], float):
+        def generate_dynamic_shape(attrs):
+            self.dynamic_shape.min_input_shape = {
+                "input1_data": [1, 4, 4],
+                "input2_data": [1, 4, 4]
+            }
+            self.dynamic_shape.max_input_shape = {
+                "input1_data": [16, 4, 4],
+                "input2_data": [16, 4, 128]
+            }
+            self.dynamic_shape.opt_input_shape = {
+                "input1_data": [8, 4, 4],
+                "input2_data": [8, 4, 16]
+            }
+
+        attrs = [
+            program_config.ops[i].attrs
+            for i in range(len(program_config.ops))
+        ]
+
+        # for dynamic_shape
+        generate_dynamic_shape(attrs)
+        self.trt_param.precision = paddle_infer.PrecisionType.Float32
+        yield self.create_inference_config(), (1, 3), 1e-5
+        self.trt_param.precision = paddle_infer.PrecisionType.Half
+        yield self.create_inference_config(), (1, 3), 1e-5
+
+    def add_skip_trt_case(self):
+        def teller1(program_config, predictor_config):
+            if len(
+                    self.dynamic_shape.min_input_shape
+            ) != 0 and self.trt_param.precision == paddle_infer.PrecisionType.Half:
+                return True
+            return False
+
+        self.add_skip_case(
+            teller1, SkipReasons.TRT_NOT_IMPLEMENTED,
+            "Tensorrt MatrixMultiply layer will get error when dynamic shape fp16 mode."
+        )
+
+    def test(self):
+        self.add_skip_trt_case()
+        self.run_test()
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_multihead_matmul.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_multihead_matmul.py
index 0754eede6d370..2d2072d277e9c 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_multihead_matmul.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_multihead_matmul.py
@@ -426,7 +426,7 @@ def clear_dynamic_shape():
         # for dynamic_shape
         generate_dynamic_shape(attrs)
         self.trt_param.precision = paddle_infer.PrecisionType.Float32
-        yield self.create_inference_config(), (1, 3), (1e-5, 1e-5)
+        yield self.create_inference_config(), (1, 3), (1e-5, 1e-4)
         self.trt_param.precision = paddle_infer.PrecisionType.Half
         yield self.create_inference_config(), (1, 3), (1e-5, 1e-5)
 
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_nearest_interp_v2.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_nearest_interp_v2.py
index 0c7715c957085..57d7d70c66a5b 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_nearest_interp_v2.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_nearest_interp_v2.py
@@ -18,6 +18,7 @@
 import paddle.inference as paddle_infer
 from functools import partial
 from typing import Optional, List, Callable, Dict, Any, Set
+import unittest
 
 
 class TrtConvertNearestInterpV2Test(TrtLayerAutoScanTest):
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_reduce_mean.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_reduce_mean.py
index 6c4c2ef4e1a14..b09ae80555e08 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_reduce_mean.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_reduce_mean.py
@@ -120,7 +120,7 @@ def generate_trt_nodes_num(attrs, dynamic_shape):
             attrs, False), 1e-5
         self.trt_param.precision = paddle_infer.PrecisionType.Half
         yield self.create_inference_config(), generate_trt_nodes_num(
-            attrs, False), (1e-5, 1e-5)
+            attrs, False), (1e-4, 1e-4)
 
         # for dynamic_shape
         generate_dynamic_shape(attrs)
@@ -129,7 +129,7 @@ def generate_trt_nodes_num(attrs, dynamic_shape):
                                                                      True), 1e-5
         self.trt_param.precision = paddle_infer.PrecisionType.Half
         yield self.create_inference_config(), generate_trt_nodes_num(
-            attrs, True), (1e-5, 1e-5)
+            attrs, True), (1e-4, 1e-4)
 
         pass
 
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_reduce_sum.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_reduce_sum.py
index 1cc9defa1010b..ba0f61a276898 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_reduce_sum.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_reduce_sum.py
@@ -120,7 +120,7 @@ def generate_trt_nodes_num(attrs, dynamic_shape):
             attrs, False), (1e-5, 1e-5)
         self.trt_param.precision = paddle_infer.PrecisionType.Half
         yield self.create_inference_config(), generate_trt_nodes_num(
-            attrs, False), (1e-5, 1e-5)
+            attrs, False), (1e-4, 1e-4)
 
         # for dynamic_shape
         generate_dynamic_shape(attrs)
@@ -129,7 +129,7 @@ def generate_trt_nodes_num(attrs, dynamic_shape):
             attrs, True), (1e-5, 1e-5)
         self.trt_param.precision = paddle_infer.PrecisionType.Half
         yield self.create_inference_config(), generate_trt_nodes_num(
-            attrs, True), (1e-5, 1e-5)
+            attrs, True), (1e-4, 1e-4)
 
         pass
 
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_tile.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_tile.py
index c1a5493fd328a..cbbd13a7b8003 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_tile.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_tile.py
@@ -20,6 +20,10 @@
 from typing import Optional, List, Callable, Dict, Any, Set
 import unittest
 
+import hypothesis
+from hypothesis import given, settings, seed, example, assume
+import hypothesis.strategies as st
+
 
 class TrtConvertTileTest(TrtLayerAutoScanTest):
     def is_program_valid(self, program_config: ProgramConfig) -> bool:
@@ -34,35 +38,34 @@ def is_program_valid(self, program_config: ProgramConfig) -> bool:
 
         return True
 
-    def sample_program_configs(self):
+    def sample_program_configs(self, *args, **kwargs):
         def generate_input1(attrs: List[Dict[str, Any]]):
             return np.ones([1, 2, 3, 4]).astype(np.float32)
 
-        for repeat_times in [[100], [1, 2], [0, 3], [1, 2, 100]]:
-            dics = [{"repeat_times": repeat_times}]
-
-            ops_config = [{
-                "op_type": "tile",
-                "op_inputs": {
-                    "X": ["input_data"]
-                },
-                "op_outputs": {
-                    "Out": ["tile_output_data"]
-                },
-                "op_attrs": dics[0]
-            }]
-            ops = self.generate_op_config(ops_config)
-
-            program_config = ProgramConfig(
-                ops=ops,
-                weights={},
-                inputs={
-                    "input_data": TensorConfig(data_gen=partial(generate_input1,
-                                                                dics))
-                },
-                outputs=["tile_output_data"])
-
-            yield program_config
+        dics = [{"repeat_times": kwargs['repeat_times']}]
+
+        ops_config = [{
+            "op_type": "tile",
+            "op_inputs": {
+                "X": ["input_data"]
+            },
+            "op_outputs": {
+                "Out": ["tile_output_data"]
+            },
+            "op_attrs": dics[0]
+        }]
+        ops = self.generate_op_config(ops_config)
+
+        program_config = ProgramConfig(
+            ops=ops,
+            weights={},
+            inputs={
+                "input_data": TensorConfig(data_gen=partial(generate_input1,
+                                                            dics))
+            },
+            outputs=["tile_output_data"])
+
+        yield program_config
 
     def sample_predictor_configs(
             self, program_config) -> (paddle_infer.Config, List[int], float):
@@ -109,8 +112,9 @@ def generate_trt_nodes_num(attrs, dynamic_shape):
         yield self.create_inference_config(), generate_trt_nodes_num(attrs,
                                                                      True), 1e-4
 
-    def test(self):
-        self.run_test()
+    @given(repeat_times=st.sampled_from([[100], [1, 2], [0, 3], [1, 2, 100]]))
+    def test(self, *args, **kwargs):
+        self.run_test(*args, **kwargs)
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_deformable_conv.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_deformable_conv.py
new file mode 100644
index 0000000000000..508095fb80175
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_deformable_conv.py
@@ -0,0 +1,95 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+from inference_pass_test import InferencePassTest
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+from paddle.fluid.core import PassVersionChecker
+from paddle.fluid.core import AnalysisConfig
+
+
+class TRTDeformableConvTest(InferencePassTest):
+    def setUp(self):
+        self.set_params()
+        with fluid.program_guard(self.main_program, self.startup_program):
+            input = fluid.data(
+                name='input', shape=self.input_size, dtype=self.dtype)
+            offset = fluid.data(
+                name='offset', shape=self.offset_size, dtype=self.dtype)
+            mask = fluid.data(
+                name='mask', shape=self.mask_size, dtype=self.dtype)
+
+            output = fluid.layers.deformable_conv(
+                input,
+                offset,
+                mask,
+                self.num_filters,
+                self.filter_size,
+                stride=self.stride,
+                padding=self.padding,
+                dilation=self.dilations,
+                groups=self.groups,
+                deformable_groups=self.deformable_groups,
+                im2col_step=self.im2col_step)
+
+        self.feeds = {
+            'input': np.random.random(self.input_size).astype(self.dtype),
+            'offset': np.random.random(self.offset_size).astype(self.dtype),
+            'mask': np.random.random(self.mask_size).astype(self.dtype)
+        }
+        self.enable_trt = True
+        dtype = AnalysisConfig.Precision.Float32
+        if self.dtype == 'float16':
+            dtype = AnalysisConfig.Precision.Half
+        self.trt_parameters = TRTDeformableConvTest.TensorRTParam(
+            1 << 30, self.bs, 0, dtype, False, False)
+        self.fetch_list = [output]
+
+    def set_params(self):
+        self.groups = 1
+        self.padding = [1, 1]
+        self.dilations = [1, 1]
+        self.stride = [1, 1]
+        self.im2col_step = 1
+        self.deformable_groups = 1
+
+        self.bs = 2
+        self.input_size = [self.bs, 8, 4, 4]
+        self.num_filters = 8
+        self.filter_size = 3
+        offset_c = 2 * self.deformable_groups * self.filter_size * self.filter_size
+        mask_c = self.deformable_groups * self.filter_size * self.filter_size
+        self.offset_size = [
+            self.input_size[0], offset_c, self.input_size[2], self.input_size[3]
+        ]
+        self.mask_size = [
+            self.input_size[0], mask_c, self.input_size[2], self.input_size[3]
+        ]
+
+        self.dtype = 'float32'
+
+    def test_check_output(self):
+        if core.is_compiled_with_cuda():
+            use_gpu = True
+            self.check_output_with_option(use_gpu)
+            self.assertTrue(
+                PassVersionChecker.IsCompatible('tensorrt_subgraph_pass'))
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/trt_layer_auto_scan_test.py b/python/paddle/fluid/tests/unittests/ir/inference/trt_layer_auto_scan_test.py
index edd033f28c0ed..7432101e787c2 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/trt_layer_auto_scan_test.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/trt_layer_auto_scan_test.py
@@ -12,275 +12,4 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import numpy as np
-import unittest
-import itertools
-import abc
-import enum
-import sys
-import os
-import logging
-import time
-import paddle
-import paddle.fluid as fluid
-import paddle.fluid.core as core
-import paddle.inference as paddle_infer
-import shutil
-
-from paddle import compat as cpt
-from typing import Optional, List, Callable, Dict, Any, Set
-from program_config import TensorConfig, OpConfig, ProgramConfig, create_fake_model, create_quant_model
-from auto_scan_test import AutoScanTest, SkipReasons
-
-logging.basicConfig(level=logging.INFO, format="%(message)s")
-
-
-class TrtLayerAutoScanTest(AutoScanTest):
-    class TensorRTParam:
-        '''
-        TensorRT subgraph engine parameters. 
-        '''
-
-        def __init__(self, workspace_size, max_batch_size, min_subgraph_size,
-                     precision, use_static, use_calib_mode):
-            self.workspace_size = workspace_size
-            self.max_batch_size = max_batch_size
-            self.min_subgraph_size = min_subgraph_size
-            self.precision = precision
-            self.use_static = use_static
-            self.use_calib_mode = use_calib_mode
-
-    class DynamicShapeParam:
-        '''
-         Prepare TensorRT subgraph engine dynamic shape parameters. 
-         '''
-
-        def __init__(self, min_input_shape, max_input_shape, opt_input_shape,
-                     disable_trt_plugin_fp16):
-            self.min_input_shape = min_input_shape
-            self.max_input_shape = max_input_shape
-            self.opt_input_shape = opt_input_shape
-            self.disable_trt_plugin_fp16 = disable_trt_plugin_fp16
-
-    def __init__(self, methodName='runTest'):
-        super(TrtLayerAutoScanTest, self).__init__(methodName)
-        self.trt_param = self.TensorRTParam(
-            workspace_size=1024,
-            max_batch_size=4,
-            min_subgraph_size=0,
-            precision=paddle_infer.PrecisionType.Float32,
-            use_static=True,
-            use_calib_mode=False)
-        self.dynamic_shape = self.DynamicShapeParam({}, {}, {}, False)
-        self.num_percent_cases = float(
-            os.getenv(
-                'TEST_NUM_PERCENT_CASES', default='1.0'))
-        abs_dir = os.path.abspath(os.path.dirname(__file__))
-        cache_dir = str(self.__module__) + '_trt_cache_dir'
-        self.trt_cache_dir = os.path.join(abs_dir, cache_dir)
-
-    def create_inference_config(self, use_trt=True) -> paddle_infer.Config:
-        config = paddle_infer.Config()
-        config.disable_glog_info()
-        config.enable_use_gpu(100, 0)
-        config.set_optim_cache_dir(self.trt_cache_dir)
-        if use_trt:
-            config.switch_ir_debug()
-            config.enable_tensorrt_engine(
-                max_batch_size=self.trt_param.max_batch_size,
-                workspace_size=self.trt_param.workspace_size,
-                min_subgraph_size=self.trt_param.min_subgraph_size,
-                precision_mode=self.trt_param.precision,
-                use_static=self.trt_param.use_static,
-                use_calib_mode=self.trt_param.use_calib_mode)
-            if len(self.dynamic_shape.min_input_shape
-                   ) != 0 and self.dynamic_shape.min_input_shape.keys(
-                   ) == self.dynamic_shape.max_input_shape.keys(
-                   ) and self.dynamic_shape.min_input_shape.keys(
-                   ) == self.dynamic_shape.opt_input_shape.keys():
-                config.set_trt_dynamic_shape_info(
-                    self.dynamic_shape.min_input_shape,
-                    self.dynamic_shape.max_input_shape,
-                    self.dynamic_shape.opt_input_shape,
-                    self.dynamic_shape.disable_trt_plugin_fp16)
-        return config
-
-    def assert_tensors_near(self,
-                            atol: float,
-                            rtol: float,
-                            tensor: Dict[str, np.array],
-                            baseline: Dict[str, np.array]):
-        for key, arr in tensor.items():
-            self.assertTrue(
-                baseline[key].shape == arr.shape,
-                "The output shape of GPU and TensorRT are not equal, the baseline shape is "
-                + str(baseline[key].shape) + ', but the trt shape is ' +
-                str(arr.shape))
-            self.assertTrue(
-                np.allclose(
-                    baseline[key], arr, atol=atol, rtol=rtol),
-                "Output has diff between GPU and TensorRT. ")
-
-    def assert_op_size(self, trt_engine_num, paddle_op_num):
-        last_passed_program = 'transpose_flatten_concat_fuse_pass.pdmodel'
-        model_bytes = paddle.static.load_from_file(last_passed_program)
-        pg = paddle.static.deserialize_program(model_bytes)
-        main_block = pg.desc.block(0)
-        op_size = main_block.op_size()
-        op_types = [
-            main_block.op(i).type() == 'tensorrt_engine' for i in range(op_size)
-        ]
-        trt_engine_size = sum(op_types)
-        paddle_op_size = op_size - trt_engine_size
-        self.assertTrue(trt_engine_size == trt_engine_num,
-                        'trt_engine_num is {}, but got {}!'.format(
-                            trt_engine_size, trt_engine_num))
-        self.assertTrue(paddle_op_size == paddle_op_num,
-                        'paddle_op_num is {}, but got {}!'.format(
-                            paddle_op_size, paddle_op_num))
-
-    def skip_log(self, msg: str):
-        logging.warning("SKIP: " + msg)
-
-    def fail_log(self, msg: str):
-        logging.error("FAILE: " + msg)
-
-    def success_log(self, msg: str):
-        logging.info("SUCCESS: " + msg)
-
-    def validate(self, func: Callable[..., bool]):
-        pass
-
-    def generate_op_config(self,
-                           ops_config: List[Dict[str, Any]]) -> List[OpConfig]:
-        ops = []
-        for i in range(len(ops_config)):
-            op_config = ops_config[i]
-            ops.append(
-                OpConfig(
-                    type=op_config['op_type'],
-                    inputs=op_config['op_inputs'],
-                    outputs=op_config['op_outputs'],
-                    attrs=op_config['op_attrs']))
-        return ops
-
-    def inference_config_str(self, config: paddle_infer.Config):
-        dic = {}
-        enable_trt = config.tensorrt_engine_enabled()
-        trt_precison = config.tensorrt_precision_mode()
-        trt_dynamic_shape = config.tensorrt_dynamic_shape_enabled()
-        if enable_trt:
-            dic['use_trt'] = True
-            dic['trt_precision'] = trt_precison
-            dic['use_dynamic_shape'] = trt_dynamic_shape
-        else:
-            dic['use_trt'] = False
-        return str(dic)
-
-    def run_test(self, quant=False):
-        status = True
-        np.random.seed(int(1000 * time.time()) % 2**32)
-        run_flags = []
-        for prog_config in self.sample_program_configs():
-            # In CI, only run 30% cases
-            if np.random.rand() < self.num_percent_cases:
-                run_flags.append(True)
-            else:
-                run_flags.append(False)
-        np.random.seed(1024)
-
-        for prog_config, run_flags in zip(self.sample_program_configs(),
-                                          run_flags):
-            if not run_flags:
-                continue
-
-            # if program is invalid, we should skip that cases.
-            if not self.is_program_valid(prog_config):
-                continue
-
-            model, params = create_fake_model(prog_config)
-            if quant:
-                model, params = create_quant_model(model, params)
-
-            feed_data = {}
-            for name, tensor_config in prog_config.inputs.items():
-                feed_data[name] = {
-                    'data': tensor_config.data,
-                    'lod': tensor_config.lod
-                }
-
-            results: List[Dict[str, Tensor]] = []
-
-            # baseline: gpu run
-            gpu_config = self.create_inference_config(use_trt=False)
-            results.append(
-                self.run_test_config(model, params, prog_config, gpu_config,
-                                     feed_data))
-            self.success_log('RUN_GPU_BASELINE ' + str(prog_config) + ' vs ' +
-                             self.inference_config_str(gpu_config))
-
-            for pred_config, nodes_num, threshold in self.sample_predictor_configs(
-                    prog_config):
-
-                if os.path.exists(self.trt_cache_dir):
-                    shutil.rmtree(self.trt_cache_dir)
-
-                if isinstance(threshold, float):
-                    atol = threshold
-                    rtol = 1e-8
-                elif isinstance(threshold, list) or isinstance(threshold,
-                                                               tuple):
-                    atol = threshold[0]
-                    rtol = threshold[1]
-                else:
-                    raise NotImplementedError
-
-                if quant and pred_config.tensorrt_precision_mode(
-                ) != paddle_infer.PrecisionType.Int8:
-                    continue
-                if pred_config.tensorrt_precision_mode(
-                ) == paddle_infer.PrecisionType.Int8 and not quant:
-                    continue
-
-                skip_flag = False
-                for skip_info in self.skip_cases:
-                    if skip_info[0](prog_config, pred_config):
-                        skip_flag = True
-                        if skip_info[1] == SkipReasons.TRT_NOT_IMPLEMENTED:
-                            self.skip_log("[TRT_NOT_IMPLEMENTED] " + skip_info[
-                                2] + ' ' + repr(prog_config) + ' vs ' + self.
-                                          inference_config_str(pred_config))
-                        elif skip_info[1] == SkipReasons.TRT_NOT_SUPPORT:
-                            self.skip_log("[TRT_NOT_SUPPORT] " + skip_info[
-                                2] + ' ' + repr(prog_config) + ' vs ' + self.
-                                          inference_config_str(pred_config))
-                        else:
-                            raise NotImplementedError
-                        break
-
-                try:
-                    pred_config_deserialize = paddle_infer.Config(pred_config)
-                    results.append(
-                        self.run_test_config(model, params, prog_config,
-                                             pred_config, feed_data))
-                    self.assert_tensors_near(atol, rtol, results[-1],
-                                             results[0])
-                    if not skip_flag:
-                        self.assert_op_size(nodes_num[0], nodes_num[1])
-                    # deserialize test
-                    if nodes_num[0] > 0:
-                        self.run_test_config(model, params, prog_config,
-                                             pred_config_deserialize, feed_data)
-                except Exception as e:
-                    self.fail_log(
-                        str(prog_config) + ' vs ' + self.inference_config_str(
-                            pred_config) +
-                        '\033[1;31m \nERROR INFO: {}\033[0m'.format(str(e)))
-                    if not skip_flag:
-                        status = False
-                    continue
-
-                self.success_log('RUN ' + str(prog_config) + ' vs ' +
-                                 self.inference_config_str(pred_config))
-
-        # self.assertTrue(status)
+from auto_scan_test import TrtLayerAutoScanTest, SkipReasons
diff --git a/python/paddle/fluid/tests/unittests/ir/test_ir_generate_pass.py b/python/paddle/fluid/tests/unittests/ir/test_ir_generate_pass.py
index 61bd554ad2616..2a7c2768e27cd 100644
--- a/python/paddle/fluid/tests/unittests/ir/test_ir_generate_pass.py
+++ b/python/paddle/fluid/tests/unittests/ir/test_ir_generate_pass.py
@@ -33,12 +33,12 @@ def pattern(x, w, b):
                 return ewadd
 
         def replace(x, w, b):
-            fc = ir.PassDesc.OP.fc
-            fc.Attr("in_num_col_dims").ReusePattern(
-                "mul", name="x_num_col_dims")
+            fc = ir.PassDesc.OP.fc(Input=x, W=w, Bias=b)
+            fc.Attr("in_num_col_dims").MappedPattern(
+                op="mul", name="x_num_col_dims")
             if with_relu:
                 fc.SetAttr("activation_type", "relu")
-            return fc(Input=x, W=w, Bias=b)
+            return fc
 
         return pattern, replace
 
@@ -96,8 +96,8 @@ def replace(x, y1, y2):
 @ir.RegisterPass
 def generate_combine_mul_v2():
     def pattern(x, y1, y2):
-        mul1 = ir.PassDesc.OP.matmul_v2(x, y1)
-        mul2 = ir.PassDesc.OP.matmul_v2(x, y2)
+        mul1 = ir.PassDesc.OP.matmul_v2(X=x, Y=y1)
+        mul2 = ir.PassDesc.OP.matmul_v2(X=x, Y=y2)
         return mul1, mul2
 
     def replace(x, y1, y2):
@@ -126,11 +126,71 @@ def pattern(x):
         op1 = ir.PassDesc.OP.transpose2
         op2 = ir.PassDesc.OP.transpose2
         # op2.Attr("axis").EQ(op1.Attr("axis"))
-        return op2(X=op1(X=x))
+        return op2(X=op1(X=x).Output("Out")).Output("Out")
 
     return pattern, lambda x: x
 
 
+@ir.RegisterPass
+def generate_layer_norm_fuse_pass():
+    def pattern(x, gamma, beta):
+        gamma.Attr("shape").Size().EQ(1)
+        gamma.Attr("shape")[0].EQ(x.Attr("shape")[-1])
+        beta.Attr("shape").EQ(gamma.Attr("shape"))
+
+        mean1 = ir.PassDesc.OP.reduce_mean(X=x)
+        mean1.SetAttr("dim", [-1])
+        mean1.SetAttr("reduce_all", False)
+        mean1.SetAttr("keep_dim", True)
+        ewsub = ir.PassDesc.OP.elementwise_sub(X=x, Y=mean1)
+        pow = ir.PassDesc.OP.pow(X=ewsub)
+        pow.SetAttr("factor", 2.0)
+        mean2 = ir.PassDesc.OP.reduce_mean(X=pow)
+        mean2.SetAttr("dim", [-1])
+        mean2.SetAttr("reduce_all", False)
+        mean2.SetAttr("keep_dim", True)
+        scale = ir.PassDesc.OP.scale(X=mean2)
+        sqrt = ir.PassDesc.OP.sqrt(X=scale)
+        ewdiv = ir.PassDesc.OP.elementwise_sub(X=ewsub, Y=sqrt)
+        ewmul = ir.PassDesc.OP.elementwise_mul(X=ewdiv, Y=gamma)
+        return ir.PassDesc.OP.elementwise_add(X=ewmul, Y=beta)
+
+    def replace(x, gamma, beta):
+        layer_norm = ir.PassDesc.OP.layer_norm(X=x, Scale=gamma, Bias=beta)
+        layer_norm.SetAttr("begin_norm_axis", x.Attr("shape").Size() - 1)
+        layer_norm.Attr("epsilon").MappedPattern(op="scale", name="bias")
+        layer_norm.SetAttr("is_test", True)
+        return layer_norm.Output("Y")
+
+    return pattern, replace
+
+
+@ir.RegisterPass
+def unimplemented_operand_exception():
+    def pattern(x, y):
+        return ir.PassDesc.OP.elementwise_add(X=x, Y=y)
+
+    def replace(x, y):
+        out = ir.PassDesc.OP.elementwise_add(X=x, Y=y)
+        out.SetAttr("axis", x.Attr("shape") - 1)
+        return out
+
+    return pattern, replace
+
+
+@ir.RegisterPass
+def unimplemented_operation_exception():
+    def pattern(x, y):
+        return ir.PassDesc.OP.elementwise_add(X=x, Y=y)
+
+    def replace(x, y):
+        out = ir.PassDesc.OP.elementwise_add(X=x, Y=y)
+        out.SetAttr("axis", x.Attr("shape").Size() + 1)
+        return out
+
+    return pattern, replace
+
+
 def get_multi_pass_desc_from_str(s):
     multi_pass_desc = ir.pass_desc_pb2.MultiPassDesc()
     multi_pass_desc.ParseFromString(s)
@@ -151,12 +211,24 @@ def convert_ops_to_op_dicts(self, ops):
     def test_has_attr(self):
         self.assertFalse(hasattr(ir.PassDesc.OP, '__name__'))
 
+    def test_exception(self):
+        paddle.enable_static()
+        program = paddle.static.Program()
+        startup_program = paddle.static.Program()
+        with paddle.static.program_guard(program, startup_program):
+            x = paddle.static.data("x", [10, 10], "float32")
+            y = paddle.static.data("y", [10, 10], "float32")
+            paddle.add(x, y)
+        graph = core.Graph(program.desc)
+        with self.assertRaises(NotImplementedError):
+            core.get_pass("unimplemented_operand_exception").apply(graph)
+        with self.assertRaises(NotImplementedError):
+            core.get_pass("unimplemented_operation_exception").apply(graph)
+
     def test_generate_fc_fuse(self):
         def _check_fc_fuse_pass(pass_desc, with_relu):
-            pattern_op_dicts = self.convert_ops_to_op_dicts(
-                pass_desc.pattern.blocks[0].ops)
-            replace_op_dicts = self.convert_ops_to_op_dicts(
-                pass_desc.replace.blocks[0].ops)
+            pattern_op_dicts = self.convert_ops_to_op_dicts(pass_desc.pattern)
+            replace_op_dicts = self.convert_ops_to_op_dicts(pass_desc.replace)
             self.assertEqual(len(pattern_op_dicts.get("mul", [])), 1)
             self.assertEqual(
                 len(pattern_op_dicts.get("elementwise_add", [])), 1)
@@ -166,10 +238,9 @@ def _check_fc_fuse_pass(pass_desc, with_relu):
             else:
                 pattern_op_num = 2  # ewadd, mul
             self.assertEqual(len(pass_desc.var_maps), 4)
-            self.assertEqual(
-                len(pass_desc.pattern.blocks[0].ops), pattern_op_num)
-            self.assertEqual(len(pass_desc.replace.blocks[0].ops), 1)
-            self.assertEqual(len(pass_desc.attr_maps), 1)
+            self.assertEqual(len(pass_desc.pattern), pattern_op_num)
+            self.assertEqual(len(pass_desc.replace), 1)
+            self.assertEqual(len(pass_desc.op_attr_maps), 1)
 
         helper = ir.RegisterPassHelper(generate_fc_fuse())
         s = helper.SerializeMultiPassDesc()
@@ -253,12 +324,10 @@ def test_generate_combine_mul_v2(self):
         self.assertEqual(len(multi_pass_desc.pass_descs), 1)
         pass_desc = multi_pass_desc.pass_descs[0]
         self.assertEqual(len(pass_desc.var_maps), 5)
-        self.assertEqual(len(pass_desc.pattern.blocks[0].ops), 2)
-        self.assertEqual(len(pass_desc.replace.blocks[0].ops), 4)
-        pattern_op_dicts = self.convert_ops_to_op_dicts(
-            pass_desc.pattern.blocks[0].ops)
-        replace_op_dicts = self.convert_ops_to_op_dicts(
-            pass_desc.replace.blocks[0].ops)
+        self.assertEqual(len(pass_desc.pattern), 2)
+        self.assertEqual(len(pass_desc.replace), 4)
+        pattern_op_dicts = self.convert_ops_to_op_dicts(pass_desc.pattern)
+        replace_op_dicts = self.convert_ops_to_op_dicts(pass_desc.replace)
         self.assertEqual(len(pattern_op_dicts.get("matmul_v2", [])), 2)
         self.assertEqual(len(replace_op_dicts.get("concat", [])), 1)
         self.assertEqual(len(replace_op_dicts.get("matmul_v2", [])), 1)
@@ -292,3 +361,33 @@ def check_generate_simplify_inference(self, pass_type):
     def test_generate_simplify_inference(self):
         self.check_generate_simplify_inference("generate_simplify_inference_v1")
         self.check_generate_simplify_inference("generate_simplify_inference_v2")
+
+    def test_generate_layer_norm_fuse_pass(self):
+        paddle.enable_static()
+        program = paddle.static.Program()
+        startup_program = paddle.static.Program()
+        with paddle.static.program_guard(program, startup_program):
+            x = paddle.static.data("x", [3, 64, 120], "float32")
+            gamma = paddle.static.create_parameter(
+                shape=[120], dtype="float32", is_bias=True)
+            beta = paddle.static.create_parameter(
+                shape=[120], dtype="float32", is_bias=True)
+
+            x_sub_mean = x - paddle.mean(x, axis=-1, keepdim=True)
+            std_dev = paddle.mean(x_sub_mean.pow(2), axis=-1, keepdim=True)
+            lnorm = x_sub_mean - (std_dev + 1e-5).sqrt()
+            out = lnorm * gamma + beta
+        graph = core.Graph(program.desc)
+        before_node_nums = len(graph.nodes())
+        core.get_pass("generate_layer_norm_fuse_pass").apply(graph)
+        after_node_nums = len(graph.nodes())
+        self.assertEqual(after_node_nums, before_node_nums - 14)
+        after_program = paddle.fluid.framework.IrGraph(graph).to_program()
+        executor = paddle.static.Executor(paddle.CPUPlace())
+        executor.run(startup_program)
+        feed = {"x": np.random.random([3, 64, 120]).astype("float32")}
+        before_out = executor.run(program, feed=feed, fetch_list=[out.name])
+        after_out = executor.run(after_program,
+                                 feed=feed,
+                                 fetch_list=[out.name])
+        self.assertTrue(np.allclose(before_out, after_out))
diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_int8_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_int8_mkldnn_op.py
index 2cfb6146f3f55..7508ecbb2946d 100644
--- a/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_int8_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_int8_mkldnn_op.py
@@ -23,13 +23,12 @@
 
 
 def conv2d_forward_refer(input, filter, group, conv_param):
-    out, in_n, out_h, out_w, out_c = conv2d_forward_naive(input, filter, group,
-                                                          conv_param)
+    out, _, _, _, _ = conv2d_forward_naive(input, filter, group, conv_param)
     return out
 
 
-@unittest.skipIf(not core.supports_bfloat16(),
-                 "place does not support BF16 evaluation")
+@unittest.skipIf(not core.supports_int8(),
+                 "place does not support int8 computation")
 class TestConv2DInt8Op(TestConv2DOp):
     def setUp(self):
         self.op_type = "conv2d"
@@ -53,73 +52,61 @@ def setUp(self):
             'pad': self.pad,
             'dilation': self.dilations
         }
-
+        # This implementation of convolution quantization is based on OneDNN documentation
+        # https://oneapi-src.github.io/oneDNN/dev_guide_int8_computations.html#doxid-dev-guide-int8-computations-1dg-i8-comp-s11
+        scale_output_shift = (self.scale_out /
+                              (self.scale_in * self.scale_weights[0]))
         filter = np.random.random(self.filter_size).astype(self.weighttype)
-        if self.srctype == np.uint8:
-            input = np.random.randint(0, 10,
+
+        # When the Intel AVX2 or Intel AVX512 Instruction Set is used
+        # the reorder additionally scales the weights by 0.5
+        # to overcome the potential overflow issue. If the processor supports VNNI instructions,
+        # modification of the weights is not necessary.
+        avx_scale = 0.5 if not core.supports_vnni(
+        ) and self.srctype == np.int8 else 1.
+        filter_int = np.round(filter * self.scale_weights[0] *
+                              avx_scale).astype(np.int32)
+        scale_output_shift = scale_output_shift / avx_scale
+
+        def conv2d_forward_refer_helper(input_):
+            return conv2d_forward_refer(
+                input_.astype(np.int32), filter_int, self.groups,
+                conv2d_param).astype(np.float32) * scale_output_shift
+
+        def residual_helper(init_low, init_high, output_):
+            input_residual_ = np.random.randint(
+                init_low, init_high,
+                self.input_residual_size).astype(self.srctype)
+            return (output_ + input_residual_ *
+                    (self.scale_out / self.scale_in_eltwise)), input_residual_
+
+        if self.srctype == np.int8:
+            init_low, init_high = (-5, 5)
+            input = np.random.randint(init_low, init_high,
                                       self.input_size).astype(self.srctype)
+            input_shift = (np.ones(self.input_size) * 128).astype(np.uint8)
+
+            output1 = conv2d_forward_refer_helper(
+                np.round(input + input_shift).astype(np.int32))
+            output2 = conv2d_forward_refer_helper(
+                np.round(input_shift).astype(np.int32))
+            output = output1 - output2
         else:
-            input = np.random.randint(-5, 5,
+            init_low, init_high = (0, 10)
+            input = np.random.randint(init_low, init_high,
                                       self.input_size).astype(self.srctype)
-            input_shift = (np.ones(self.input_size) * 128).astype(np.uint8)
+            output = conv2d_forward_refer_helper(input)
 
-        if self.srctype == np.int8:
-            filter_int = np.round(filter * self.scale_weights[0] *
-                                  0.5).astype(np.int32)
-            scale_output_shift = self.scale_out / (self.scale_in *
-                                                   self.scale_weights[0] * 0.5)
-            output1 = conv2d_forward_refer(
-                np.round((input.astype(np.int32) + input_shift) *
-                         self.scale_in).astype(np.int32), filter_int,
-                self.groups,
-                conv2d_param).astype(np.float32) * scale_output_shift
-            output2 = conv2d_forward_refer(
-                np.round((input_shift) * self.scale_in).astype(np.int32),
-                filter_int, self.groups,
-                conv2d_param).astype(np.float32) * scale_output_shift
-            if self.fuse_residual:
-                input_residual = np.random.randint(
-                    -5, 5, self.input_residual_size).astype(self.srctype)
-                output_tmp = np.round(output1 - output2 + input_residual.astype(
-                    self.srctype) * (self.scale_out / self.scale_in_eltwise))
-                if self.fuse_activation == "relu":
-                    output = np.maximum(output_tmp, 0).astype(self.dsttype)
-                else:
-                    output = output_tmp.astype(self.dsttype)
-            else:
-                if self.fuse_activation == "relu":
-                    output = np.maximum(np.round(output1 - output2),
-                                        0).astype(self.dsttype)
-                else:
-                    output = np.round(output1 - output2).astype(self.dsttype)
+        if self.fuse_residual:
+            output, input_residual = residual_helper(init_low, init_high,
+                                                     output)
 
-        else:
-            filter_int = np.round(filter *
-                                  self.scale_weights[0]).astype(np.int32)
-            scale_output_shift = self.scale_out / (self.scale_in *
-                                                   self.scale_weights[0])
-            output1 = conv2d_forward_refer(
-                input.astype(np.int32), filter_int, self.groups,
-                conv2d_param).astype(np.float32)
-            output1_tmp = np.round(output1 * (
-                self.scale_out / (self.scale_in * self.scale_weights[0])))
-
-            if self.fuse_residual:
-                input_residual = np.random.randint(
-                    0, 10, self.input_residual_size).astype(self.srctype)
-                output_tmp_res = np.round(output1 * (self.scale_out / (
-                    self.scale_in * self.scale_weights[
-                        0])) + input_residual.astype(np.int32) * (
-                            self.scale_out / self.scale_in_eltwise))
-                if self.fuse_activation == "relu":
-                    output = np.maximum(output_tmp_res, 0).astype(self.dsttype)
-                else:
-                    output = output_tmp_res.astype(self.dsttype)
-            else:
-                if self.fuse_activation == "relu":
-                    output = np.maximum(output1_tmp, 0).astype(self.dsttype)
-                else:
-                    output = output1_tmp.astype(self.dsttype)
+        output = np.round(output)
+
+        if self.fuse_activation == "relu":
+            output = np.maximum(output, 0)
+
+        output = output.astype(self.dsttype)
 
         self.inputs = {
             'Input':
@@ -169,7 +156,7 @@ def init_test_case(self):
         f_c = self.input_size[1] // self.groups
         self.input_residual_size = [1, 2, 3, 3]
         self.filter_size = [2, f_c, 3, 3]
-        self.scale_in = 1.0
+        self.scale_in = 0.95
         self.scale_out = 0.5
         self.scale_weights = [10.0]
         self.scale_in_eltwise = 0.6
@@ -185,7 +172,7 @@ def init_fuse_residual(self):
         self.fuse_residual = True
 
 
-#--------------------test conv2d u8 in and u8 out with residual fuse--------------------
+# --------------------test conv2d u8 in and u8 out with residual fuse--------------------
 
 
 class TestConv2D(TestConv2DInt8Op):
@@ -197,7 +184,7 @@ def init_test_case(self):
         assert np.mod(self.input_size[1], self.groups) == 0
         f_c = self.input_size[1] // self.groups
         self.filter_size = [6, f_c, 3, 3]
-        self.scale_in = 1.0
+        self.scale_in = 0.95
         self.scale_out = 0.5
         self.scale_weights = [10.0]
         self.scale_in_eltwise = 0.6
@@ -224,7 +211,7 @@ def init_test_case(self):
         assert np.mod(self.input_size[1], self.groups) == 0
         f_c = self.input_size[1] // self.groups
         self.filter_size = [6, f_c, 3, 3]
-        self.scale_in = 1.0
+        self.scale_in = 0.95
         self.scale_out = 0.8
         self.scale_weights = [10.0]
         self.scale_in_eltwise = 0.5
@@ -240,7 +227,7 @@ def init_test_case(self):
         assert np.mod(self.input_size[1], self.groups) == 0
         f_c = self.input_size[1] // self.groups
         self.filter_size = [6, f_c, 3, 3]
-        self.scale_in = 1.0
+        self.scale_in = 0.95
         self.scale_out = 0.8
         self.scale_weights = [10.0]
         self.scale_in_eltwise = 0.5
@@ -255,7 +242,7 @@ def init_test_case(self):
         assert np.mod(self.input_size[1], self.groups) == 0
         f_c = self.input_size[1] // self.groups
         self.filter_size = [6, f_c, 1, 1]
-        self.scale_in = 1.0
+        self.scale_in = 0.95
         self.scale_out = 0.5
         self.scale_weights = [12.0]
         self.scale_in_eltwise = 0.5
@@ -270,7 +257,7 @@ def init_test_case(self):
         assert np.mod(self.input_size[1], self.groups) == 0
         f_c = self.input_size[1] // self.groups
         self.filter_size = [6, f_c, 1, 1]
-        self.scale_in = 1.0
+        self.scale_in = 0.95
         self.scale_out = 0.5
         self.scale_weights = [10.0]
         self.scale_in_eltwise = 0.8
@@ -290,32 +277,32 @@ def init_data_type_with_fusion(self, input_dt, fuse_activation, fuse_residual):
 
 def create_test_int8_class(parent):
 
-    #--------------------test conv2d s8 in and u8 out--------------------
+    # --------------------test conv2d s8 in and u8 out--------------------
     class TestS8U8Case(parent):
         def init_data_type(self):
             init_data_type_with_fusion(self, np.int8, "relu", False)
 
-    #--------------------test conv2d s8 in and s8 out--------------------
+    # --------------------test conv2d s8 in and s8 out--------------------
     class TestS8S8Case(parent):
         def init_data_type(self):
             init_data_type_with_fusion(self, np.int8, "", False)
 
-    #--------------------test conv2d u8 in and s8 out--------------------
+    # --------------------test conv2d u8 in and s8 out--------------------
     class TestU8S8Case(parent):
         def init_data_type(self):
             init_data_type_with_fusion(self, np.uint8, "", False)
 
-    #--------------------test conv2d u8 in and u8 out without residual fuse--------------------
+    # --------------------test conv2d u8 in and u8 out without residual fuse--------------------
     class TestU8U8Case(parent):
         def init_data_type(self):
             init_data_type_with_fusion(self, np.uint8, "relu", False)
 
-    #--------------------test conv2d s8 in and s8 out with residual fuse--------------------
+    # --------------------test conv2d s8 in and s8 out with residual fuse--------------------
     class TestS8S8ResCase(parent):
         def init_data_type(self):
             init_data_type_with_fusion(self, np.int8, "", True)
 
-    #--------------------test conv2d u8 in and s8 out with residual fuse--------------------
+    # --------------------test conv2d u8 in and s8 out with residual fuse--------------------
     class TestU8S8ResCase(parent):
         def init_data_type(self):
             init_data_type_with_fusion(self, np.uint8, "", True)
@@ -333,9 +320,9 @@ def init_data_type(self):
     TestS8S8Case.__name__ = cls_name_s8s8
     TestU8S8Case.__name__ = cls_name_u8s8
     TestU8U8Case.__name__ = cls_name_u8u8
-
     TestS8S8ResCase.__name__ = cls_name_s8s8_re_1
     TestU8S8ResCase.__name__ = cls_name_u8s8_re_1
+
     globals()[cls_name_s8u8] = TestS8U8Case
     globals()[cls_name_s8s8] = TestS8S8Case
     globals()[cls_name_u8s8] = TestU8S8Case
@@ -344,7 +331,7 @@ def init_data_type(self):
     globals()[cls_name_u8s8_re_1] = TestU8S8ResCase
 
     if os.name != 'nt':
-        #--------------------test conv2d s8 in and u8 out with residual fuse--------------------
+        # --------------------test conv2d s8 in and u8 out with residual fuse--------------------
         class TestS8U8ResCase(parent):
             def init_data_type(self):
                 init_data_type_with_fusion(self, np.int8, "relu", True)
diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_conv3d_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_conv3d_mkldnn_op.py
index ca25b849b4a78..dcaee49558ba2 100644
--- a/python/paddle/fluid/tests/unittests/mkldnn/test_conv3d_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/mkldnn/test_conv3d_mkldnn_op.py
@@ -95,4 +95,6 @@ def init_kernel_type(self):
 
 
 if __name__ == '__main__':
+    from paddle import enable_static
+    enable_static()
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_div_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_div_mkldnn_op.py
new file mode 100644
index 0000000000000..a3c41d2f03476
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_div_mkldnn_op.py
@@ -0,0 +1,179 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+import unittest
+import numpy as np
+from paddle import enable_static
+from paddle.fluid.tests.unittests.op_test import OpTest, OpTestTool, convert_float_to_uint16
+from paddle.fluid.framework import _current_expected_place
+import paddle.fluid.core as core
+
+
+@OpTestTool.skip_if(not (isinstance(_current_expected_place(), core.CPUPlace)),
+                    "GPU is not supported")
+class TestMKLDNNElementwiseDivOp(OpTest):
+    def setUp(self):
+        self.op_type = "elementwise_div"
+        self.init_dtype()
+        self.init_input_output()
+        self.init_kernel_type()
+        self.init_axis()
+        self.inputs = {
+            'X': OpTest.np_dtype_to_fluid_dtype(self.x),
+            'Y': OpTest.np_dtype_to_fluid_dtype(self.y)
+        }
+        self.attrs = {'axis': self.axis, 'use_mkldnn': self.use_mkldnn}
+        self.outputs = {'Out': self.out}
+
+    def init_input_output(self):
+        self.x = np.random.uniform(0.1, 1, [13, 17]).astype(self.dtype)
+        self.y = np.random.uniform(0.1, 1, [13, 17]).astype(self.dtype)
+        self.out = np.divide(self.x, self.y)
+
+    def test_check_grad_normal(self):
+        self.check_grad(['X', 'Y'], 'Out', None, 0.005, False, 0.02)
+
+    def test_check_grad_ignore_x(self):
+        self.check_grad(['Y'], 'Out', set("X"), 0.005, False, 0.02)
+
+    def test_check_grad_ignore_y(self):
+        self.check_grad(['X'], 'Out', set('Y'), 0.005, False, 0.02)
+
+    def init_axis(self):
+        self.axis = -1
+
+    def init_kernel_type(self):
+        self.use_mkldnn = True
+
+    def init_dtype(self):
+        self.dtype = np.float32
+
+    def test_check_output(self):
+        self.check_output()
+
+
+class TestMKLDNNElementwiseDivOp2(TestMKLDNNElementwiseDivOp):
+    def init_input_output(self):
+        self.x = np.random.uniform(0.1, 1, [100]).astype(self.dtype)
+        self.y = np.random.uniform(0.1, 1, [100]).astype(self.dtype)
+        self.out = np.divide(self.x, self.y)
+
+
+class TestMKLDNNElementwiseDivOp3(TestMKLDNNElementwiseDivOp):
+    def init_input_output(self):
+        self.x = np.random.uniform(0.1, 1, [2, 3, 4, 5]).astype(self.dtype)
+        self.y = np.random.uniform(0.1, 1, [2, 3, 4, 5]).astype(self.dtype)
+        self.out = np.divide(self.x, self.y)
+
+
+class TestMKLDNNElementwiseDivOp4(TestMKLDNNElementwiseDivOp):
+    def init_input_output(self):
+        self.x = np.random.uniform(1, 2, [2, 3, 4, 32]).astype(self.dtype)
+        self.y = np.random.uniform(1, 2, [4, 32]).astype(self.dtype)
+        self.out = np.divide(self.x, self.y)
+
+    # TODO(piotrekobiIntel): Enable when grad is ready
+    def test_check_grad_normal(self):
+        pass
+
+    def test_check_grad_ignore_x(self):
+        pass
+
+
+class TestMKLDNNElementwiseDivOp5(TestMKLDNNElementwiseDivOp):
+    def init_input_output(self):
+        self.x = np.random.uniform(1, 2, [2, 3, 4, 100]).astype(self.dtype)
+        self.y = np.random.uniform(1, 2, [100]).astype(self.dtype)
+        self.out = np.divide(self.x, self.y)
+
+    # TODO(piotrekobiIntel): Enable when grad is ready
+    def test_check_grad_normal(self):
+        pass
+
+    def test_check_grad_ignore_x(self):
+        pass
+
+
+@OpTestTool.skip_if_not_cpu_bf16()
+class TestBf16(TestMKLDNNElementwiseDivOp):
+    def setUp(self):
+        self.op_type = "elementwise_div"
+        self.init_dtype()
+        self.init_input_output()
+        self.init_kernel_type()
+        self.init_axis()
+
+        self.x_bf16 = convert_float_to_uint16(self.x)
+        self.y_bf16 = convert_float_to_uint16(self.y)
+        self.inputs = {'X': self.x_bf16, 'Y': self.y_bf16}
+        self.attrs = {'axis': self.axis, 'use_mkldnn': self.use_mkldnn}
+        self.outputs = {'Out': convert_float_to_uint16(self.out)}
+
+    def init_dtype(self):
+        self.dtype = np.float32
+        self.mkldnn_data_type = "bfloat16"
+
+    def init_input_output(self):
+        self.x = np.random.uniform(0.1, 1, [100]).astype(self.dtype)
+        self.y = np.random.uniform(0.1, 1, [100]).astype(self.dtype)
+        self.out = np.divide(self.x, self.y)
+
+    def test_check_output(self):
+        self.check_output_with_place(core.CPUPlace())
+
+    def test_check_grad_normal(self):
+        self.check_grad_with_place(
+            core.CPUPlace(), ["X", "Y"],
+            "Out",
+            user_defined_grads=[
+                np.divide(self.x, self.y), np.divide(
+                    (np.multiply(-self.x, self.x)), np.multiply(self.y, self.y))
+            ],
+            user_defined_grad_outputs=[self.x_bf16])
+
+    def test_check_grad_ignore_x(self):
+        self.check_grad_with_place(
+            core.CPUPlace(), ["Y"],
+            "Out",
+            user_defined_grads=[
+                np.divide((np.multiply(-self.x, self.y)),
+                          np.multiply(self.y, self.y))
+            ],
+            user_defined_grad_outputs=[self.y_bf16])
+
+    def test_check_grad_ignore_y(self):
+        self.check_grad_with_place(
+            core.CPUPlace(), ["X"],
+            "Out",
+            user_defined_grads=[np.divide(self.x, self.y)],
+            user_defined_grad_outputs=[self.x_bf16])
+
+
+class TestBf16Broadcasting(TestBf16):
+    def init_input_output(self):
+        self.x = np.random.uniform(1, 2, [2, 3, 4, 100]).astype(self.dtype)
+        self.y = np.random.uniform(1, 2, [100]).astype(self.dtype)
+        self.out = np.subtract(self.x, self.y)
+
+    def test_check_grad_normal(self):
+        pass
+
+    def test_check_grad_ignore_x(self):
+        pass
+
+
+if __name__ == '__main__':
+    enable_static()
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_stack_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_stack_mkldnn_op.py
new file mode 100644
index 0000000000000..f7424014c2111
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/mkldnn/test_stack_mkldnn_op.py
@@ -0,0 +1,115 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+from paddle.fluid.tests.unittests.op_test import OpTest, OpTestTool, skip_check_grad_ci
+import paddle
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+
+
+@OpTestTool.skip_if_not_cpu()
+class TestStack2DOneDNNOp(OpTest):
+    def initDefaultParameters(self):
+        self.num_inputs = 4
+        self.input_dim = (2, 2)
+        self.axis = 1
+        self.dtype = np.float32
+
+    def initParameters(self):
+        pass
+
+    def getInputNames(self):
+        input_names = []
+        for i in range(self.num_inputs):
+            input_names.append('x{}'.format(i))
+        return input_names
+
+    def setUp(self):
+        self.initDefaultParameters()
+        self.initParameters()
+        self.op_type = 'stack'
+        self.op_inputs = []
+
+        for i in range(self.num_inputs):
+            self.op_inputs.append(
+                np.random.random(size=self.input_dim).astype(np.float32))
+
+        input_list = []
+        input_names = self.getInputNames()
+        for i in range(self.num_inputs):
+            input_list.append((input_names[i], self.op_inputs[i]))
+
+        self.inputs = {'X': input_list}
+        self.outputs = {'Y': np.stack(self.op_inputs, axis=self.axis)}
+        self.attrs = {'axis': self.axis, 'use_mkldnn': True}
+
+    def test_check_output(self):
+        self.check_output_with_place(core.CPUPlace())
+
+    # JUST FOR CI TO PASS, GRAD IS NOT IMPLEMENTED YET
+    def test_check_grad(self):
+        pass
+
+
+class TestStack1DOneDNNOp(TestStack2DOneDNNOp):
+    def initParameters(self):
+        self.input_dim = (100)
+        self.axis = 0
+
+
+class TestStack1DAxis1OneDNNOp(TestStack2DOneDNNOp):
+    def initParameters(self):
+        self.input_dim = (100)
+        self.axis = 1
+
+
+class TestStack2DAxisLastOneDNNOp(TestStack2DOneDNNOp):
+    def initParameters(self):
+        self.input_dim = (13, 24)
+        self.num_inputs = 5
+        self.axis = -1
+
+
+class TestStack3DAxisNegativeOneDNNOp(TestStack2DOneDNNOp):
+    def initParameters(self):
+        self.input_dim = (10, 128, 128)
+        self.axis = -2
+
+
+class TestStack3DOneDNNOp(TestStack2DOneDNNOp):
+    def initParameters(self):
+        self.input_dim = (10, 128, 128)
+        self.num_inputs = 3
+        self.axis = 1
+
+
+class TestStack4DOneDNNOp(TestStack2DOneDNNOp):
+    def initParameters(self):
+        self.input_dim = (2, 2, 2, 2)
+        self.num_inputs = 3
+        self.axis = 4
+
+
+class TestStack5DOneDNNOp(TestStack2DOneDNNOp):
+    def initParameters(self):
+        self.input_dim = (2, 3, 4, 5, 6)
+        self.num_inputs = 6
+        self.axis = 0
+
+
+if __name__ == "__main__":
+    paddle.enable_static()
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/npu/CMakeLists.txt b/python/paddle/fluid/tests/unittests/npu/CMakeLists.txt
index 4e81bb9544ceb..8e31d58195be8 100644
--- a/python/paddle/fluid/tests/unittests/npu/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/npu/CMakeLists.txt
@@ -17,6 +17,7 @@ if (WITH_ASCEND_CL)
     # Note: the following test cases has running time more than 120s
     set_tests_properties(test_nearest_interp_op_npu PROPERTIES TIMEOUT 200)
     set_tests_properties(test_nearest_interp_v2_op_npu PROPERTIES TIMEOUT 200)
+    set_tests_properties(test_bilinear_interp_v2_op_npu PROPERTIES TIMEOUT 200)
     set_tests_properties(test_stack_op_npu PROPERTIES TIMEOUT 300)
     set_tests_properties(test_conv2d_transpose_op_npu PROPERTIES TIMEOUT 200)
     set_tests_properties(test_conv2d_op_npu PROPERTIES TIMEOUT 300)
diff --git a/python/paddle/fluid/tests/unittests/npu/test_bilinear_interp_v2_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_bilinear_interp_v2_op_npu.py
new file mode 100644
index 0000000000000..6da49b8d84d19
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/npu/test_bilinear_interp_v2_op_npu.py
@@ -0,0 +1,279 @@
+#   Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+import sys
+sys.path.append("..")
+from op_test import OpTest
+import paddle.fluid.core as core
+import paddle.fluid as fluid
+from paddle.nn.functional import interpolate
+import paddle
+
+from test_bilinear_interp_v2_op import bilinear_interp_np
+
+paddle.enable_static()
+
+
+class TestBilinearInterpOp(OpTest):
+    def set_npu(self):
+        self.__class__.use_npu = True
+        self.place = paddle.NPUPlace(0)
+
+    def setUp(self):
+        self.set_npu()
+        self.out_size = None
+        self.actual_shape = None
+        self.data_layout = 'NCHW'
+        self.init_test_case()
+        self.op_type = "bilinear_interp_v2"
+        input_np = np.random.random(self.input_shape).astype(self.dtype)
+
+        if self.data_layout == "NCHW":
+            in_h = self.input_shape[2]
+            in_w = self.input_shape[3]
+        else:
+            in_h = self.input_shape[1]
+            in_w = self.input_shape[2]
+        scale_h = 0
+        scale_w = 0
+        if self.scale:
+            if isinstance(self.scale, float) or isinstance(self.scale, int):
+                if self.scale > 0.:
+                    scale_h = scale_w = float(self.scale)
+            if isinstance(self.scale, list) and len(self.scale) == 1:
+                scale_w = scale_h = self.scale[0]
+            elif isinstance(self.scale, list) and len(self.scale) > 1:
+                scale_w = self.scale[1]
+                scale_h = self.scale[0]
+            out_h = int(in_h * scale_h)
+            out_w = int(in_w * scale_w)
+        else:
+            out_h = self.out_h
+            out_w = self.out_w
+
+        output_np = bilinear_interp_np(input_np, out_h, out_w, scale_w, scale_h,
+                                       self.out_size, self.actual_shape,
+                                       self.align_corners, self.align_mode,
+                                       self.data_layout)
+
+        self.inputs = {'X': input_np}
+        if self.out_size is not None:
+            self.inputs['OutSize'] = self.out_size
+        if self.actual_shape is not None:
+            self.inputs['OutSize'] = self.actual_shape
+
+        self.attrs = {
+            'out_h': self.out_h,
+            'out_w': self.out_w,
+            'interp_method': self.interp_method,
+            'align_corners': self.align_corners,
+            'align_mode': self.align_mode,
+            'data_layout': self.data_layout
+        }
+        if self.scale:
+            if isinstance(self.scale, float) or isinstance(self.scale, int):
+                if self.scale > 0.:
+                    self.scale = [self.scale]
+            if isinstance(self.scale, list) and len(self.scale) == 1:
+                self.scale = [self.scale[0], self.scale[0]]
+            self.attrs['scale'] = self.scale
+        self.outputs = {'Out': output_np}
+
+    def test_check_output(self):
+        self.check_output_with_place(self.place, atol=self.atol)
+
+    def test_check_grad(self):
+        self.__class__.exist_check_grad = True
+        if self.dtype == 'float16':
+            return
+        self.max_relative_error = 0.005
+        inputs_to_check = ['X']
+        output_names = ['Out']
+        no_grad_set = set()
+        cpu_place = fluid.CPUPlace()
+        cpu_grads = self._get_gradient(inputs_to_check, cpu_place, output_names,
+                                       no_grad_set)
+        npu_grads = self._get_gradient(inputs_to_check, self.place,
+                                       output_names, no_grad_set)
+        self._assert_is_close(cpu_grads, npu_grads, inputs_to_check,
+                              self.max_relative_error,
+                              "Gradient Check between places")
+
+    def init_test_case(self):
+        self.interp_method = 'bilinear'
+        self.input_shape = [2, 3, 5, 7]
+        self.out_h = 60
+        self.out_w = 25
+        self.scale = 1.5
+        self.align_corners = False
+        self.align_mode = 1
+        self.dtype = 'float32'
+        self.atol = 1e-5
+
+
+class TestBilinearInterpCaseFP16(TestBilinearInterpOp):
+    def init_test_case(self):
+        super(TestBilinearInterpCaseFP16, self).init_test_case()
+        self.dtype = 'float16'
+        self.atol = 1e-2
+
+
+class TestBilinearInterpCase1(TestBilinearInterpOp):
+    def init_test_case(self):
+        super(TestBilinearInterpCase1, self).init_test_case()
+        self.input_shape = [4, 1, 7, 8]
+        self.out_h = 1
+        self.out_w = 1
+        self.scale = 0.
+
+
+class TestBilinearInterpCase2(TestBilinearInterpOp):
+    def init_test_case(self):
+        super(TestBilinearInterpCase2, self).init_test_case()
+        self.input_shape = [3, 3, 9, 6]
+        self.out_h = 12
+        self.out_w = 12
+        self.scale = 0.
+
+
+class TestBilinearInterpCase3(TestBilinearInterpOp):
+    def init_test_case(self):
+        super(TestBilinearInterpCase3, self).init_test_case()
+        self.input_shape = [1, 1, 32, 64]
+        self.out_h = 64
+        self.out_w = 32
+        self.scale = 0.
+
+
+class TestBilinearInterpCase4(TestBilinearInterpOp):
+    def init_test_case(self):
+        super(TestBilinearInterpCase4, self).init_test_case()
+        self.input_shape = [4, 1, 7, 8]
+        self.out_h = 1
+        self.out_w = 1
+        self.scale = 0.
+        self.out_size = np.array([2, 2]).astype("int32")
+
+
+class TestBilinearInterpCase5(TestBilinearInterpOp):
+    def init_test_case(self):
+        super(TestBilinearInterpCase5, self).init_test_case()
+        self.input_shape = [3, 3, 9, 6]
+        self.out_h = 12
+        self.out_w = 12
+        self.scale = 0.
+        self.out_size = np.array([11, 11]).astype("int32")
+
+
+class TestBilinearInterpCase6(TestBilinearInterpOp):
+    def init_test_case(self):
+        super(TestBilinearInterpCase6, self).init_test_case()
+        self.input_shape = [1, 1, 32, 64]
+        self.out_h = 64
+        self.out_w = 32
+        self.scale = 0.
+        self.out_size = np.array([65, 33]).astype("int32")
+
+
+class TestBilinearInterpCase7(TestBilinearInterpOp):
+    def init_test_case(self):
+        super(TestBilinearInterpCase7, self).init_test_case()
+        self.input_shape = [1, 1, 32, 64]
+        self.out_h = 64
+        self.out_w = 32
+        self.scale = [2.0, 0.5]
+
+
+class TestBilinearInterpSame(TestBilinearInterpOp):
+    def init_test_case(self):
+        super(TestBilinearInterpSame, self).init_test_case()
+        self.input_shape = [2, 3, 32, 64]
+        self.out_h = 32
+        self.out_w = 64
+        self.scale = 0.
+
+
+class TestBilinearInterpActualShape(TestBilinearInterpOp):
+    def init_test_case(self):
+        super(TestBilinearInterpActualShape, self).init_test_case()
+        self.input_shape = [3, 2, 32, 16]
+        self.out_h = 64
+        self.out_w = 32
+        self.scale = 0.
+        self.out_size = np.array([66, 40]).astype("int32")
+
+
+class TestBilinearInterpDataLayout(TestBilinearInterpOp):
+    def init_test_case(self):
+        super(TestBilinearInterpDataLayout, self).init_test_case()
+        self.input_shape = [2, 5, 5, 3]
+        self.out_h = 2
+        self.out_w = 2
+        self.scale = 0.
+        self.out_size = np.array([3, 3]).astype("int32")
+        self.data_layout = "NHWC"
+
+
+class TestBilinearInterpOtherMethod1(TestBilinearInterpOp):
+    def set_align_mode(self):
+        self.align_corners = False
+        self.align_mode = 1
+
+
+class TestBilinearInterpWithMethod2(TestBilinearInterpOp):
+    def set_align_mode(self):
+        self.align_corners = False
+        self.align_mode = 0
+
+
+class TestBilinearInterpWithMethod3(TestBilinearInterpOp):
+    def set_align_mode(self):
+        self.align_corners = True
+        self.align_mode = 0
+
+
+class TestBilinearInterpScale1(TestBilinearInterpOp):
+    def init_test_case(self):
+        super(TestBilinearInterpScale1, self).init_test_case()
+        self.input_shape = [2, 3, 5, 7]
+        self.out_h = 60
+        self.out_w = 25
+        self.scale = 2.
+
+
+class TestBilinearInterpScale2(TestBilinearInterpOp):
+    def init_test_case(self):
+        super(TestBilinearInterpScale2, self).init_test_case()
+        self.input_shape = [2, 3, 5, 7]
+        self.out_h = 60
+        self.out_w = 25
+        self.scale = 1.
+
+
+class TestBilinearInterpZero(TestBilinearInterpOp):
+    def init_test_case(self):
+        super(TestBilinearInterpZero, self).init_test_case()
+        self.input_shape = [2, 3, 5, 7]
+        self.out_h = 60
+        self.out_w = 25
+        self.scale = 0.2
+        self.align_mode = 0
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/npu/test_expand_v2_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_expand_v2_op_npu.py
index d48d2a8430134..fd0b9850308b2 100755
--- a/python/paddle/fluid/tests/unittests/npu/test_expand_v2_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_expand_v2_op_npu.py
@@ -201,13 +201,16 @@ def test_check_output(self):
 # Situation 5: input x is int32
 # skip grad check for int32
 class TestExpandV2OpInteger(OpTest):
+    def init_dtype(self):
+        self.dtype = 'int32'
+
     def setUp(self):
         self.set_npu()
         self.place = paddle.NPUPlace(0)
         self.op_type = "expand_v2"
         self.inputs = {
             'X': np.random.randint(
-                10, size=(2, 4, 20)).astype("int32")
+                10, size=(2, 4, 20)).astype(self.dtype)
         }
         self.attrs = {'shape': [2, 4, 20]}
         output = np.tile(self.inputs['X'], (1, 1, 1))
@@ -221,6 +224,25 @@ def test_check_output(self):
         self.check_output_with_place(self.place)
 
 
+class TesstExpandV2OpInt64(TestExpandV2OpInteger):
+    def init_dtype(self):
+        self.dtype = 'int64'
+
+
+class TesstExpandV2OpBool(TestExpandV2OpInteger):
+    def init_dtype(self):
+        self.dtype = 'bool'
+
+    def setUp(self):
+        self.set_npu()
+        self.place = paddle.NPUPlace(0)
+        self.op_type = "expand_v2"
+        self.inputs = {'X': np.random.randint(10, size=(2, 4, 20)) > 5}
+        self.attrs = {'shape': [2, 4, 20]}
+        output = np.tile(self.inputs['X'], (1, 1, 1))
+        self.outputs = {'Out': output}
+
+
 class TestExpandV2Error(unittest.TestCase):
     def test_errors(self):
         with program_guard(Program(), Program()):
diff --git a/python/paddle/fluid/tests/unittests/npu/test_fill_constant_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_fill_constant_op_npu.py
index 2ab15213803a9..a3e781c990ecb 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_fill_constant_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_fill_constant_op_npu.py
@@ -120,5 +120,29 @@ def test_check_output(self):
         self.check_output_with_place(self.place, atol=1e-3)
 
 
+class TestFillConstantBool(OpTest):
+    def setUp(self):
+        self.set_npu()
+        self.place = paddle.NPUPlace(0)
+        self.op_type = "fill_constant"
+
+        self.inputs = {}
+        self.attrs = {
+            'shape': [123, 92],
+            'value': True,
+            'dtype': core.VarDesc.VarType.BOOL
+        }
+        self.outputs = {'Out': np.full((123, 92), True).astype(self.dtype)}
+
+    def set_npu(self):
+        self.__class__.use_npu = True
+
+    def init_dtype(self):
+        self.dtype = np.BOOL
+
+    def test_check_output(self):
+        self.check_output_with_place(self.place)
+
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/npu/test_lookup_table_v2_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_lookup_table_v2_op_npu.py
index 1031be4c1a7b4..fefff0974ae40 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_lookup_table_v2_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_lookup_table_v2_op_npu.py
@@ -38,7 +38,7 @@ def setUp(self):
         np.random.seed(SEED)
         w = np.random.random([self.vocab, self.dim]).astype(self.dtype)
         x = np.random.randint(
-            0, self.vocab, size=(self.bsz, self.seqlen)).astype(np.int32)
+            0, self.vocab, size=(self.bsz, self.seqlen)).astype(self.ids_dtype)
         out = w[x]
         if self.padding_idx != -1:
             out[np.squeeze(x == self.padding_idx)] = np.zeros(self.dim)
@@ -60,6 +60,7 @@ def set_npu(self):
 
     def init_dtype(self):
         self.dtype = np.float32
+        self.ids_dtype = np.int32
 
     def init_dims(self):
         self.bsz = 6
@@ -85,6 +86,7 @@ class TestLookupTableV2FP16(TestLookupTableV2):
 
     def init_dtype(self):
         self.dtype = np.float16
+        self.ids_dtype = np.int32
 
     def set_npu(self):
         self.__class__.use_npu = True
@@ -105,6 +107,7 @@ class TestLookupTableV2Dim32FP16(TestLookupTableV2):
 
     def init_dtype(self):
         self.dtype = np.float16
+        self.ids_dtype = np.int64
 
     def init_dims(self):
         self.bsz = 6
@@ -122,5 +125,14 @@ def init_padding_idx(self):
         self.padding_idx = np.random.randint(0, self.vocab)
 
 
+class TestLookupTableV2WithPadding1(TestLookupTableV2):
+    def init_padding_idx(self):
+        self.padding_idx = np.random.randint(0, self.vocab)
+
+    def init_dtype(self):
+        self.dtype = np.float32
+        self.ids_dtype = np.int64
+
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/npu/test_reduce_max_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_reduce_max_op_npu.py
index f6c346159b8be..68a28ea72e1fc 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_reduce_max_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_reduce_max_op_npu.py
@@ -271,5 +271,30 @@ def init_dtype(self):
         self.dtype = np.float16
 
 
+@skip_check_grad_ci(
+    reason="reduce_max is discontinuous non-derivable function,"
+    " its gradient check is not supported by unittest framework.")
+class TestReduceMaxOpInt64(TestNPUReduceMaxOp):
+    """Remove Max with subgradient from gradient check to confirm the success of CI."""
+
+    def setUp(self):
+        self.op_type = "reduce_max"
+        self.set_npu()
+        self.init_dtype()
+
+        self.inputs = {'X': np.random.random((5, 6, 10)).astype(self.dtype)}
+        self.attrs = {
+            'dim': [-2, -1],
+            'out_dtype': int(core.VarDesc.VarType.INT64)
+        }
+        self.outputs = {
+            'Out': self.inputs['X'].max(
+                axis=tuple(self.attrs['dim'])).astype(np.float32)
+        }
+
+    def init_dtype(self):
+        self.dtype = np.int64
+
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/npu/test_scale_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_scale_op_npu.py
index 65ec28fbf7d3a..424c4ca0ff35d 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_scale_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_scale_op_npu.py
@@ -39,7 +39,8 @@ def setUp(self):
         }
         self.attrs = {'scale': -2.3, 'bias': 0, 'bias_after_scale': True}
         self.outputs = {
-            'Out': self.inputs['X'] * self.dtype(self.attrs['scale'])
+            'Out': (self.inputs['X'] *
+                    self.dtype(self.attrs['scale'])).astype(self.dtype)
         }
 
     def set_npu(self):
@@ -57,6 +58,16 @@ def init_dtype(self):
         self.dtype = np.float16
 
 
+class TestScaleInt(TestScale):
+    def init_dtype(self):
+        self.dtype = np.int32
+
+
+class TestScaleInt64(TestScale):
+    def init_dtype(self):
+        self.dtype = np.int64
+
+
 class TestBiasAfterScale(OpTest):
     def setUp(self):
         self.set_npu()
diff --git a/python/paddle/fluid/tests/unittests/op_test.py b/python/paddle/fluid/tests/unittests/op_test.py
index 41fd0b442fe1c..a3e1650c131cd 100644
--- a/python/paddle/fluid/tests/unittests/op_test.py
+++ b/python/paddle/fluid/tests/unittests/op_test.py
@@ -1832,3 +1832,9 @@ def skip_if_not_cpu_bf16(cls):
             not (isinstance(_current_expected_place(), core.CPUPlace) and
                  core.supports_bfloat16()),
             "Place does not support BF16 evaluation")
+
+    @classmethod
+    def skip_if_not_cpu(cls):
+        return OpTestTool.skip_if(
+            not isinstance(_current_expected_place(), core.CPUPlace),
+            "OneDNN supports only CPU for now")
diff --git a/python/paddle/fluid/tests/unittests/op_test_xpu.py b/python/paddle/fluid/tests/unittests/op_test_xpu.py
index 239708cc17449..187d78ba04aee 100644
--- a/python/paddle/fluid/tests/unittests/op_test_xpu.py
+++ b/python/paddle/fluid/tests/unittests/op_test_xpu.py
@@ -44,86 +44,33 @@ class XPUOpTest(OpTest):
     @classmethod
     def setUpClass(cls):
         '''Fix random seeds to remove randomness from tests'''
-        cls._np_rand_state = np.random.get_state()
-        cls._py_rand_state = random.getstate()
-        cls.call_once = False
-        cls.dtype = np.float32
-        cls.outputs = {}
-        cls.input_shape_is_large = True
-
-        np.random.seed(123)
-        random.seed(124)
-
-        cls._use_system_allocator = _set_use_system_allocator(True)
+        cls.use_xpu = True
+        cls.use_mkldnn = False
+        super().setUpClass()
 
     @classmethod
     def tearDownClass(cls):
         """Restore random seeds"""
-        np.random.set_state(cls._np_rand_state)
-        random.setstate(cls._py_rand_state)
-
-        _set_use_system_allocator(cls._use_system_allocator)
 
         def is_empty_grad_op(op_type):
             all_op_kernels = core._get_all_register_op_kernels()
             grad_op = op_type + '_grad'
             if grad_op in all_op_kernels.keys():
-                if is_mkldnn_op_test():
-                    grad_op_kernels = all_op_kernels[grad_op]
-                    for grad_op_kernel in grad_op_kernels:
-                        if 'MKLDNN' in grad_op_kernel:
-                            return False
-                else:
-                    return False
-            return True
-
-        def is_xpu_op_test():
+                grad_op_kernels = all_op_kernels[grad_op]
+                for grad_op_kernel in grad_op_kernels:
+                    if 'XPU' in grad_op_kernel:
+                        return False
             return True
 
-        def is_mkldnn_op_test():
-            return False
+        if cls.dtype == np.float16:
+            place = paddle.XPUPlace(0)
+            if core.is_float16_supported(place) == False:
+                return
+        super().tearDownClass()
 
-        if not hasattr(cls, "op_type"):
-            raise AssertionError(
-                "This test do not have op_type in class attrs, "
-                "please set self.__class__.op_type=the_real_op_type manually.")
-
-        # case in NO_FP64_CHECK_GRAD_CASES and op in NO_FP64_CHECK_GRAD_OP_LIST should be fixed
-        if not hasattr(cls, "no_need_check_grad") \
-            and not is_empty_grad_op(cls.op_type):
-            if cls.dtype is None or \
-                (cls.dtype == np.float16 \
-                    and cls.op_type not in op_accuracy_white_list.NO_FP16_CHECK_GRAD_OP_LIST \
-                    and not hasattr(cls, "exist_check_grad")):
-                raise AssertionError("This test of %s op needs check_grad." %
-                                     cls.op_type)
-
-            # check for op test with fp64 precision, but not check mkldnn op test for now
-            if cls.dtype in [np.float32, np.float64] \
-                and cls.op_type not in op_accuracy_white_list.NO_FP64_CHECK_GRAD_OP_LIST \
-                and not hasattr(cls, 'exist_fp64_check_grad') \
-                and not is_xpu_op_test() \
-                and not is_mkldnn_op_test() \
-                and not is_rocm_op_test() \
-                and not is_npu_op_test():
-                raise AssertionError(
-                    "This test of %s op needs check_grad with fp64 precision." %
-                    cls.op_type)
-
-            if not cls.input_shape_is_large \
-                and cls.op_type not in check_shape_white_list.NEED_TO_FIX_OP_LIST:
-                raise AssertionError(
-                    "Input's shape should be large than or equal to 100 for " +
-                    cls.op_type + " Op.")
-
-    def try_call_once(self, data_type):
-        if not self.call_once:
-            self.call_once = True
-            if data_type is not None and \
-                data_type != np.float32:
-                raise AssertionError("Unsupport data type %s in xpu" %
-                                     data_type)
-            self.dtype = data_type
+    def _get_places(self):
+        places = [fluid.XPUPlace(0)]
+        return places
 
     def check_output_with_place(self,
                                 place,
@@ -133,166 +80,19 @@ def check_output_with_place(self,
                                 check_dygraph=True,
                                 inplace_atol=None):
         self.infer_dtype_from_inputs_outputs(self.inputs, self.outputs)
-        if self.dtype == np.float64 and \
-            self.op_type not in op_threshold_white_list.NEED_FIX_FP64_CHECK_OUTPUT_THRESHOLD_OP_LIST:
-            atol = 0
-
-        if self.is_bfloat16_op():
-            check_dygraph = False
-            if hasattr(self, 'force_fp32_output') and getattr(
-                    self, 'force_fp32_output'):
-                atol = 1e-2
-            else:
-                atol = 2
-
-        if no_check_set is not None:
-            if self.op_type not in no_check_set_white_list.no_check_set_white_list:
-                raise AssertionError(
-                    "no_check_set of op %s must be set to None." % self.op_type)
-
-        if check_dygraph:
-            dygraph_outs = self._calc_dygraph_output(
-                place, no_check_set=no_check_set)
-        outs, fetch_list = self._calc_output(place, no_check_set=no_check_set)
-        for out_name, out_dup in Operator.get_op_outputs(self.op_type):
-            if out_name not in self.outputs:
-                continue
-            if no_check_set is not None and out_name in no_check_set:
-                continue
-
-            def find_imperative_actual(target_name, dygraph_outs, place):
-                with fluid.dygraph.base.guard(place=place):
-                    for name in dygraph_outs:
-                        if name == target_name:
-                            return dygraph_outs[name][0]
-                        var_list = dygraph_outs[name]
-                        for i, var in enumerate(var_list):
-                            if var.name == target_name:
-                                return dygraph_outs[name][i]
-                    self.assertTrue(False, "Found failed {} {}".format(
-                        dygraph_outs.keys(), target_name))
-
-            def find_actual(target_name, fetch_list):
-                found = [
-                    i for i, var_name in enumerate(fetch_list)
-                    if var_name == target_name
-                ]
-                self.assertTrue(
-                    len(found) == 1, "Found {} {}".format(
-                        len(found), target_name))
-                return found[0]
-
-            if out_dup:
-                sub_out = self.outputs[out_name]
-                if not isinstance(sub_out, list):
-                    raise AssertionError("sub_out type %s is not list",
-                                         type(sub_out))
-                for item in sub_out:
-                    sub_out_name, expect = item[0], item[1]
-                    if check_dygraph:
-                        imperative_actual = find_imperative_actual(
-                            sub_out_name, dygraph_outs, place)
-                        imperative_actual_t = np.array(imperative_actual.value()
-                                                       .get_tensor())
-                    idx = find_actual(sub_out_name, fetch_list)
-                    actual = outs[idx]
-                    actual_t = np.array(actual)
-                    expect_t = expect[0] \
-                        if isinstance(expect, tuple) else expect
-                    self.assertTrue(
-                        np.allclose(
-                            actual_t, expect_t, atol=atol, equal_nan=equal_nan),
-                        "Output (" + sub_out_name + ") has diff at " +
-                        str(place))
-                    if check_dygraph:
-                        self.assertTrue(
-                            np.allclose(
-                                imperative_actual_t,
-                                expect_t,
-                                atol=atol,
-                                equal_nan=equal_nan),
-                            "Output (" + sub_out_name + ") has diff at " +
-                            str(place) + " in dygraph mode")
-                    if isinstance(expect, tuple):
-                        self.assertListEqual(
-                            actual.recursive_sequence_lengths(), expect[1],
-                            "Output (" + sub_out_name +
-                            ") has different lod at " + str(place))
-                        if check_dygraph:
-                            self.assertListEqual(
-                                imperative_actual.value().get_tensor()
-                                .recursive_sequence_lengths(), expect[1],
-                                "Output (" + out_name +
-                                ") has different lod at " + str(place) +
-                                " in dygraph mode")
-            else:
-                if check_dygraph:
-                    imperative_actual = find_imperative_actual(
-                        out_name, dygraph_outs, place)
-                    imperative_actual_t = np.array(imperative_actual.value()
-                                                   .get_tensor())
-                idx = find_actual(out_name, fetch_list)
-                actual = outs[idx]
-                actual_t = np.array(actual)
-                expect = self.outputs[out_name]
-                expect_t = expect[0] if isinstance(expect, tuple) else expect
-                self.assertTrue(
-                    np.allclose(
-                        actual_t, expect_t, atol=atol, equal_nan=equal_nan),
-                    "Output (" + out_name + ") has diff at " + str(place) +
-                    "\nExpect " + str(expect_t) + "\n" + "But Got" +
-                    str(actual_t) + " in class " + self.__class__.__name__ + " "
-                    + str(atol) + " " + str(expect_t - actual_t))
-                if check_dygraph:
-                    if six.moves.reduce(
-                            lambda x, y: x * y, imperative_actual_t.shape,
-                            1) == 0 and six.moves.reduce(
-                                lambda x, y: x * y, expect_t.shape, 1) == 0:
-                        pass
-                    else:
-                        self.assertTrue(
-                            np.allclose(
-                                imperative_actual_t,
-                                expect_t,
-                                atol=atol,
-                                equal_nan=equal_nan),
-                            "Output (" + out_name + ") has diff at " +
-                            str(place) + "\nExpect " + str(expect_t) + "\n" +
-                            "But Got" + str(imperative_actual_t) + " in class "
-                            + self.__class__.__name__)
-                if isinstance(expect, tuple):
-                    self.assertListEqual(actual.recursive_sequence_lengths(),
-                                         expect[1], "Output (" + out_name +
-                                         ") has different lod at " + str(place))
-                    if check_dygraph:
-                        self.assertListEqual(
-                            imperative_actual.value().get_tensor()
-                            .recursive_sequence_lengths(), expect[1],
-                            "Output (" + out_name + ") has different lod at " +
-                            str(place) + " in dygraph mode")
-
-        # Note(zhiqiu): inplace_atol should be only set when op doesn't ensure
-        # computational consistency.
-        # For example, group_norm uses AtomicAdd on CUDAPlace, which do not ensure
-        # computation order when multiple threads write the same address. So the
-        # result of group_norm is non-deterministic when datatype is float.
-        # When inplace_atol is not None, the inplace check uses numpy.allclose
-        # to check inplace result instead of numpy.array_equal.
-        if inplace_atol is not None:
-            warnings.warn(
-                "inplace_atol should only be set when op doesn't ensure computational consistency, please check it!"
-            )
-        # Check inplace for given op, its grad op, its grad_grad op, etc.
-        # No effect on original OpTest
-        # Currently not support ParallelExecutor on XPUPlace.
-        if not paddle.is_compiled_with_xpu():
-            self.check_inplace_output_with_place(
-                place, no_check_set=no_check_set, inplace_atol=inplace_atol)
-
-        if check_dygraph:
-            return outs
-        else:
-            return outs
+        #xpu not support float64
+        if self.dtype == np.float64:
+            return
+        if place == None:
+            place = paddle.XPUPlace(0)
+
+        if self.dtype == np.float16:
+            if core.is_float16_supported(place) == False:
+                return
+        if self.dtype == np.float16:
+            atol = 0.1
+        return super().check_output_with_place(
+            place, atol, no_check_set, equal_nan, check_dygraph, inplace_atol)
 
     def check_grad_with_place(self,
                               place,
@@ -303,8 +103,26 @@ def check_grad_with_place(self,
                               in_place=False,
                               max_relative_error=0.005,
                               user_defined_grads=None,
-                              check_dygraph=True):
-        place = paddle.XPUPlace(0)
+                              user_defined_grad_outputs=None,
+                              check_dygraph=True,
+                              numeric_place=None):
+        if place == None:
+            place = paddle.XPUPlace(0)
+
+        if self.dtype == np.float64:
+            return
+
+        if self.dtype == np.float16:
+            if core.is_float16_supported(place) == False:
+                return
+
+        if self.dtype == np.float16:
+            max_relative_error = 1.0
+            return super().check_grad_with_place(
+                place, inputs_to_check, output_names, no_grad_set,
+                numeric_grad_delta, in_place, max_relative_error,
+                user_defined_grads, user_defined_grads, check_dygraph)
+
         a1 = self.get_grad_with_place(
             place, inputs_to_check, output_names, no_grad_set=no_grad_set)
         a2 = self.get_grad_with_place(
diff --git a/python/paddle/fluid/tests/unittests/test_adamw_op.py b/python/paddle/fluid/tests/unittests/test_adamw_op.py
index 0a60f4cba09bc..dbeb5a430377f 100644
--- a/python/paddle/fluid/tests/unittests/test_adamw_op.py
+++ b/python/paddle/fluid/tests/unittests/test_adamw_op.py
@@ -333,7 +333,7 @@ def test_adamw_op_dygraph(self):
             lr_ratio=simple_lr_fun)
 
         loss_ref = np.array(
-            [4.8383293, 3.0854003, 1.33299, -0.418993, -2.171043])
+            [4.8383293, 3.084947, 1.3323904, -0.41943002, -2.1710064])
         for i in range(5):
             a1 = linear1(a)
             out = linear2(a1)
diff --git a/python/paddle/fluid/tests/unittests/test_auto_parallel_api.py b/python/paddle/fluid/tests/unittests/test_auto_parallel_api.py
index 3f1d692b72e98..8593e44b3d820 100644
--- a/python/paddle/fluid/tests/unittests/test_auto_parallel_api.py
+++ b/python/paddle/fluid/tests/unittests/test_auto_parallel_api.py
@@ -15,128 +15,153 @@
 from __future__ import print_function
 
 import unittest
-import functools
-import operator
-import numpy as np
 import paddle
 import paddle.fluid as fluid
-import paddle.fluid.core as core
 import paddle.nn as nn
 import paddle.distributed as dist
+from paddle.distributed.auto_parallel.dist_context import get_default_distributed_context
+from paddle.distributed.auto_parallel.process_mesh import ProcessMesh
 
 paddle.enable_static()
 
-
-def _flatten_nested_list(nested_list):
-    result = functools.reduce(operator.iconcat, nested_list, [])
-    return result
-
-
-def _append_attr_suffix(name):
-    return name + core.kAutoParallelSuffix()
-
-
-LAST_PP_STAGE = 3
-MASK = [[0, 1, 1], [0, 1, 1]]
-MESH = dist.ProcessMesh([[0, 1, 2], [3, 4, 5]])
+process_mesh1 = [0, 1, 2, 3]
+process_mesh2 = [[0, 1, 2], [3, 4, 5]]
 
 
 class SimpleNet(nn.Layer):
     def __init__(self, vocab_size=128, hidden_size=4):
         super(SimpleNet, self).__init__()
-        self.mesh = MESH
-        self.mesh.set_placement([5, 4, 3, 2, 1, 0])
         self.word_embeddings = nn.Embedding(vocab_size, hidden_size)
         self.dense1 = nn.Linear(hidden_size, hidden_size)
         self.dense2 = nn.Linear(hidden_size, hidden_size // 2)
 
     def forward(self, x, y):
-        x = dist.shard_tensor(x, self.mesh, dim_mapping=[0, -1])
-        x = dist.set_shard_mask(x, MASK)
+        # Test shard_tensor interface with dist_attr arg
+        x = dist.shard_tensor(
+            x,
+            dist_attr={"process_mesh": process_mesh1,
+                       "dims_mapping": [0, -1]})
         emb_out = self.word_embeddings(x)
-
-        dist.set_pipeline_stage(LAST_PP_STAGE)
-
-        y = dist.shard_tensor(y, self.mesh, dim_mapping=[0, -1])
-        dist.set_offload_device(y, "cpu")
+        # Test shard_tensor interface with no dist_attr arg
+        y = dist.shard_tensor(y)
         linear1 = self.dense1(y)
         out = self.dense2(linear1)
 
-        return x, y, self.mesh
+        return x, y
 
 
 class TestAutoParallelAPI(unittest.TestCase):
     def test_api(self):
+        dist_context = get_default_distributed_context()
+
         net = SimpleNet()
         data1 = fluid.layers.fill_constant(shape=[2, 4], value=1, dtype="int64")
         data2 = fluid.layers.fill_constant(
             shape=[2, 4], value=2, dtype="float32")
         data3 = fluid.layers.fill_constant(
             shape=[2, 4], value=4, dtype="float32")
-        x, y, mesh = net.forward(data1, data2)
-        mesh_attr = _append_attr_suffix('mesh_id')
-        x_mesh_id = x._get_attr(mesh_attr)
-        self.assertEqual(x_mesh_id, mesh._id)
-        x_mesh = x.process_mesh
-
-        allatts = x.attr_names
-        self.assertEqual(x_mesh, mesh)
-        shard_mask_attr = _append_attr_suffix('mask')
-        self.assertEqual(
-            x._get_attr(shard_mask_attr), _flatten_nested_list(MASK))
-        self.assertEqual(x.shard_mask, _flatten_nested_list(MASK))
-        offload_attr = _append_attr_suffix('offload_device')
-        self.assertEqual(y._get_attr(offload_attr), "cpu")
-        self.assertEqual(y.desc.has_attr(offload_attr), True)
-        self.assertEqual(y.offload_device, "cpu")
-        y._remove_attr(offload_attr)
-        self.assertEqual(y._has_attr(offload_attr), False)
-        ops = paddle.static.default_main_program().block(0).ops
-        first_op = ops[0]
-        last_op = ops[-1]
 
-        self.assertEqual(last_op.pipeline_stage, LAST_PP_STAGE)
-
-        DIMS_MAPPING1 = [0, 1]
-        DIMS_MAPPING2 = [-1, 0]
-        kwargs = {'x': data2, 'y': data3}
-        dist.shard_op(
+        x, y = net.forward(data1, data2)
+
+        dist_x = dist_context.get_dist_tensor_for_program(x)
+        self.assertEqual(dist_x.dist_attr.process_mesh.processes, process_mesh1)
+        self.assertEqual(dist_x.dist_attr.dims_mapping, [0, -1])
+        self.assertEqual(dist_x.dist_attr.shard_sizes, None)
+        self.assertEqual(dist_x.dist_attr.device_placement, None)
+        self.assertTrue(dist_x.dist_attr.is_annotated("process_mesh"))
+        self.assertTrue(dist_x.dist_attr.is_annotated("dims_mapping"))
+        self.assertFalse(dist_x.dist_attr.is_annotated("shard_sizes"))
+        self.assertFalse(dist_x.dist_attr.is_annotated("device_placement"))
+
+        dist_y = dist_context.get_dist_tensor_for_program(y)
+        self.assertEqual(dist_y.dist_attr.process_mesh, None)
+        self.assertEqual(dist_y.dist_attr.dims_mapping, [-1, -1])
+        self.assertEqual(dist_y.dist_attr.shard_sizes, None)
+        self.assertEqual(dist_y.dist_attr.device_placement, None)
+        self.assertFalse(dist_y.dist_attr.is_annotated("process_mesh"))
+        self.assertFalse(dist_y.dist_attr.is_annotated("dims_mapping"))
+        self.assertFalse(dist_y.dist_attr.is_annotated("shard_sizes"))
+        self.assertFalse(dist_y.dist_attr.is_annotated("device_placement"))
+
+        # Test shard_op interface with dist_attr
+        dims_mapping1 = [0, 1]
+        dims_mapping2 = [-1, 0]
+        dist_add = dist.shard_op(
             paddle.add,
-            mesh=mesh,
-            dim_mapping_dict={
-                data2.name: DIMS_MAPPING1,
-                data3.name: DIMS_MAPPING2
-            },
-            **kwargs)
+            dist_attr={
+                data2: {
+                    "process_mesh": process_mesh2,
+                    "dims_mapping": dims_mapping1
+                },
+                data3: {
+                    "dims_mapping": dims_mapping2
+                }
+            })
+        results = dist_add(data2, data3)
         ops = paddle.static.default_main_program().block(0).ops
         last_op = ops[-1]
 
-        self.assertEqual(last_op.process_mesh, mesh)
-        attr_name = "IN_" + data2.name
-        attr_name = _append_attr_suffix(attr_name)
-        self.assertEqual(last_op.attr(attr_name), DIMS_MAPPING1)
-        attr_name = "IN_" + data3.name
-        attr_name = _append_attr_suffix(attr_name)
-        self.assertEqual(last_op.attr(attr_name), DIMS_MAPPING2)
-
-    def test_process_mesh(self):
-        mesh1 = dist.ProcessMesh([[0, 1, 2], [3, 4, 5]], parent=MESH)
-        mesh2 = dist.ProcessMesh([[0, 1, 2], [3, 4, 5]], parent=mesh1)
-        mesh3 = dist.ProcessMesh([[0, 1], [2, 3]], parent=mesh1)
-        mesh4 = dist.ProcessMesh([[2, 3], [4, 5]], parent=mesh1)
-
-        self.assertEqual(MESH.parent, None)
-        self.assertEqual(mesh1.parent, MESH)
-        self.assertEqual(mesh1._desc.parent, MESH._id)
-        self.assertEqual(mesh3.parent, mesh1)
-        self.assertEqual(mesh4.parent, mesh1)
-        self.assertEqual(mesh1, mesh2)
-        self.assertNotEqual(mesh3, mesh4)
-        self.assertEqual(mesh2._id, mesh2._desc.id)
-        self.assertEqual(mesh3.topology, mesh3._desc.topology)
-        self.assertEqual(mesh3.topology, [2, 2])
-        self.assertEqual(mesh3.process_group, [0, 1, 2, 3])
-        self.assertEqual(mesh4.process_group, mesh4._desc.process_group)
+        dist_op = dist_context.get_dist_op_for_program(last_op)
+        self.assertEqual(dist_op.dist_attr.process_mesh,
+                         ProcessMesh(process_mesh2))
+        self.assertEqual(dist_op.dist_attr.impl_type, "default")
+        self.assertEqual(dist_op.dist_attr.impl_idx, -2)
+        self.assertTrue(dist_op.dist_attr.is_annotated("process_mesh"))
+
+        data2_dist_attr = dist_op.dist_attr.get_input_dist_attr(data2.name)
+        self.assertEqual(data2_dist_attr.process_mesh,
+                         dist_op.dist_attr.process_mesh)
+        self.assertEqual(data2_dist_attr.dims_mapping, dims_mapping1)
+        self.assertEqual(data2_dist_attr.shard_sizes, None)
+        self.assertEqual(data2_dist_attr.device_placement, None)
+        self.assertTrue(data2_dist_attr.is_annotated("process_mesh"))
+        self.assertTrue(data2_dist_attr.is_annotated("dims_mapping"))
+        self.assertFalse(data2_dist_attr.is_annotated("shard_sizes"))
+        self.assertFalse(data2_dist_attr.is_annotated("device_placement"))
+
+        data3_dist_attr = dist_op.dist_attr.get_input_dist_attr(data3.name)
+        self.assertEqual(data3_dist_attr.process_mesh,
+                         dist_op.dist_attr.process_mesh)
+        self.assertEqual(data3_dist_attr.dims_mapping, dims_mapping2)
+        self.assertEqual(data3_dist_attr.shard_sizes, None)
+        self.assertEqual(data3_dist_attr.device_placement, None)
+        self.assertTrue(data3_dist_attr.is_annotated("process_mesh"))
+        self.assertTrue(data3_dist_attr.is_annotated("dims_mapping"))
+        self.assertFalse(data3_dist_attr.is_annotated("shard_sizes"))
+        self.assertFalse(data3_dist_attr.is_annotated("device_placement"))
+
+        # Test shard_op interface with dist_attr
+        dist_add = dist.shard_op(paddle.add)
+        results = dist_add(data2, data3)
+        ops = paddle.static.default_main_program().block(0).ops
+        last_op = ops[-1]
+        dist_op = dist_context.get_dist_op_for_program(last_op)
+        self.assertEqual(dist_op.dist_attr.process_mesh, None)
+        self.assertEqual(dist_op.dist_attr.impl_type, "default")
+        self.assertEqual(dist_op.dist_attr.impl_idx, -2)
+        self.assertFalse(dist_op.dist_attr.is_annotated("process_mesh"))
+
+        data2_dist_attr = dist_op.dist_attr.get_input_dist_attr(data2.name)
+        self.assertEqual(data2_dist_attr.process_mesh,
+                         dist_op.dist_attr.process_mesh)
+        self.assertEqual(data2_dist_attr.dims_mapping, [-1, -1])
+        self.assertEqual(data2_dist_attr.shard_sizes, None)
+        self.assertEqual(data2_dist_attr.device_placement, None)
+        self.assertFalse(data2_dist_attr.is_annotated("process_mesh"))
+        self.assertFalse(data2_dist_attr.is_annotated("dims_mapping"))
+        self.assertFalse(data2_dist_attr.is_annotated("shard_sizes"))
+        self.assertFalse(data2_dist_attr.is_annotated("device_placement"))
+
+        data3_dist_attr = dist_op.dist_attr.get_input_dist_attr(data3.name)
+        self.assertEqual(data3_dist_attr.process_mesh,
+                         dist_op.dist_attr.process_mesh)
+        self.assertEqual(data3_dist_attr.dims_mapping, [-1, -1])
+        self.assertEqual(data3_dist_attr.shard_sizes, None)
+        self.assertEqual(data3_dist_attr.device_placement, None)
+        self.assertFalse(data3_dist_attr.is_annotated("process_mesh"))
+        self.assertFalse(data3_dist_attr.is_annotated("dims_mapping"))
+        self.assertFalse(data3_dist_attr.is_annotated("shard_sizes"))
+        self.assertFalse(data3_dist_attr.is_annotated("device_placement"))
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_auto_parallel_completion.py b/python/paddle/fluid/tests/unittests/test_auto_parallel_completion.py
index 21726596ca76a..05d71aca5db2c 100644
--- a/python/paddle/fluid/tests/unittests/test_auto_parallel_completion.py
+++ b/python/paddle/fluid/tests/unittests/test_auto_parallel_completion.py
@@ -28,15 +28,14 @@
 from paddle.nn.layer.transformer import _convert_param_attr_to_list
 import paddle.distributed.auto_parallel as auto
 from paddle.distributed.auto_parallel.utils import check_distributed_attr_for_program
-from paddle.distributed.auto_parallel.utils import print_program_with_distributed_attr
+from paddle.distributed.auto_parallel.utils import print_program_with_dist_attr
 from paddle.distributed.auto_parallel.utils import append_distributed_attr_suffix
-from paddle.distributed.auto_parallel.context import DistributedContext
-from paddle.distributed.auto_parallel.context import set_default_distributed_context
+from paddle.distributed.auto_parallel.dist_context import DistributedContext
+from paddle.distributed.auto_parallel.dist_context import set_default_distributed_context
 paddle.enable_static()
 _global_parallel_strategy = None
 _global_process_mesh = None
 _global_process_mesh2 = None
-ROOT_MESH = auto.ProcessMesh([[0, 1, 2, 3], [4, 5, 6, 7]])
 
 
 class MLPLayer(nn.Layer):
@@ -62,20 +61,43 @@ def __init__(self,
     def forward(self, input):
         if _global_parallel_strategy == "mp":
             auto.shard_tensor(
-                self.linear0.weight, _global_process_mesh, dim_mapping=[-1, 0])
-            auto.shard_tensor(
-                self.linear1.weight, _global_process_mesh, dim_mapping=[0, -1])
+                self.linear0.weight,
+                dist_attr={
+                    "process_mesh": _global_process_mesh,
+                    "dims_mapping": [-1, 0]
+                })
+            auto.shard_tensor(
+                self.linear1.weight,
+                dist_attr={
+                    "process_mesh": _global_process_mesh,
+                    "dims_mapping": [0, -1]
+                })
         elif _global_parallel_strategy == "dp_mp":
             auto.shard_tensor(
-                self.linear0.weight, _global_process_mesh, dim_mapping=[-1, 1])
-            auto.shard_tensor(
-                self.linear1.weight, _global_process_mesh, dim_mapping=[1, -1])
+                self.linear0.weight,
+                dist_attr={
+                    "process_mesh": _global_process_mesh,
+                    "dims_mapping": [-1, 1]
+                })
+            auto.shard_tensor(
+                self.linear1.weight,
+                dist_attr={
+                    "process_mesh": _global_process_mesh,
+                    "dims_mapping": [1, -1]
+                })
         elif _global_parallel_strategy == "pp":
             auto.shard_tensor(
-                self.linear0.weight, _global_process_mesh, dim_mapping=[-1, 1])
+                self.linear0.weight,
+                dist_attr={
+                    "process_mesh": _global_process_mesh,
+                    "dims_mapping": [-1, 1]
+                })
             auto.shard_tensor(
-                self.linear1.weight, _global_process_mesh2,
-                dim_mapping=[1, -1])
+                self.linear1.weight,
+                dist_attr={
+                    "process_mesh": _global_process_mesh2,
+                    "dims_mapping": [1, -1]
+                })
 
         out = self.norm(input)
         out = self.linear0(out)
@@ -99,10 +121,18 @@ def mlp_pretrain_forward(train_program, start_program):
 
         if _global_parallel_strategy == "dp":
             auto.shard_tensor(
-                input, _global_process_mesh, dim_mapping=[0, -1, -1])
+                input,
+                dist_attr={
+                    "process_mesh": _global_process_mesh,
+                    "dims_mapping": [0, -1, -1]
+                })
         elif _global_parallel_strategy == "dp_mp":
             auto.shard_tensor(
-                input, _global_process_mesh, dim_mapping=[0, -1, -1])
+                input,
+                dist_attr={
+                    "process_mesh": _global_process_mesh,
+                    "dims_mapping": [0, -1, -1]
+                })
 
         mlp = MLPLayer(
             hidden_size=hidden_size,
@@ -118,8 +148,7 @@ def test_mlp_dp(self):
         global _global_parallel_strategy
         _global_parallel_strategy = "dp"
         global _global_process_mesh
-        _global_process_mesh = auto.ProcessMesh(
-            mesh=[0, 1, 2, 3], parent=ROOT_MESH)
+        _global_process_mesh = auto.ProcessMesh(mesh=[0, 1, 2, 3])
         train_program = static.Program()
         start_program = static.Program()
         dist_context = DistributedContext()
@@ -127,18 +156,15 @@ def test_mlp_dp(self):
                                                             start_program)
         complete_train_program = auto.complete_annotation(train_program,
                                                           dist_context)
-        # print_program_with_distributed_attr(complete_train_program,
+        # print_program_with_dist_attr(complete_train_program,
         #                                     dist_context)
-        self.assertTrue(
-            check_distributed_attr_for_program(complete_train_program,
-                                               dist_context))
+        self.assertTrue(dist_context.validate_dist_attr_for_program())
 
     def test_mlp_mp(self):
         global _global_parallel_strategy
         _global_parallel_strategy = "mp"
         global _global_process_mesh
-        _global_process_mesh = auto.ProcessMesh(
-            mesh=[0, 1, 2, 3], parent=ROOT_MESH)
+        _global_process_mesh = auto.ProcessMesh(mesh=[0, 1, 2, 3])
 
         train_program = static.Program()
         start_program = static.Program()
@@ -147,81 +173,77 @@ def test_mlp_mp(self):
                                                             start_program)
         complete_train_program = auto.complete_annotation(train_program,
                                                           dist_context)
-        # print_program_with_distributed_attr(complete_train_program,
+        # print_program_with_dist_attr(complete_train_program,
         #                                     dist_context)
-        self.assertTrue(
-            check_distributed_attr_for_program(complete_train_program,
-                                               dist_context))
+        self.assertTrue(dist_context.validate_dist_attr_for_program())
 
     def test_mlp_dp_mp(self):
         global _global_parallel_strategy
         _global_parallel_strategy = "dp_mp"
         global _global_process_mesh
         _global_process_mesh = auto.ProcessMesh(
-            mesh=[[0, 1, 2, 3], [4, 5, 6, 7]], parent=ROOT_MESH)
-
-        train_program = static.Program()
-        start_program = static.Program()
-        dist_context = DistributedContext()
-        train_program, start_program = mlp_pretrain_forward(train_program,
-                                                            start_program)
-        complete_train_program = auto.complete_annotation(train_program,
-                                                          dist_context)
-        # print_program_with_distributed_attr(complete_train_program,
-        #                                     dist_context)
-        self.assertTrue(
-            check_distributed_attr_for_program(complete_train_program,
-                                               dist_context))
-
-    def test_mlp_misc(self):
-        # import pdb
-        global _global_parallel_strategy
-        _global_parallel_strategy = "pp"
-        global _global_process_mesh
-        _global_process_mesh = auto.ProcessMesh(
-            mesh=[[0, 1], [2, 3]], parent=ROOT_MESH)
-        global _global_process_mesh2
-        _global_process_mesh2 = auto.ProcessMesh(
-            mesh=[[4, 5], [6, 7]], parent=ROOT_MESH)
+            mesh=[[0, 1, 2, 3], [4, 5, 6, 7]])
 
         train_program = static.Program()
         start_program = static.Program()
         dist_context = DistributedContext()
         train_program, start_program = mlp_pretrain_forward(train_program,
                                                             start_program)
-        # pdb.set_trace()
         complete_train_program = auto.complete_annotation(train_program,
                                                           dist_context)
-        # print_program_with_distributed_attr(complete_train_program,
+        # print_program_with_dist_attr(complete_train_program,
         #                                     dist_context)
-        dist_context.finalize_distributed_attr_for_program(
-            complete_train_program)
-        from paddle.distributed.auto_parallel.interface import _g_process_mesh_map
-        for block in complete_train_program.blocks:
-            for tensor in block.vars.values():
-                desc = tensor.desc
-                attr_name = append_distributed_attr_suffix("mesh_id")
-                self.assertIsNotNone(desc.has_attr(attr_name))
-                attr_name = append_distributed_attr_suffix("dim_mapping")
-                self.assertIsNotNone(desc.has_attr(attr_name))
-            for op in block.ops:
-                desc = op.desc
-                attr_name = append_distributed_attr_suffix("mesh_id")
-                self.assertIsNotNone(desc.has_attr(attr_name))
-                for tensor_name in desc.input_arg_names():
-                    attr_name = append_distributed_attr_suffix("IN_" +
-                                                               tensor_name)
-                    self.assertIsNotNone(desc.has_attr(attr_name))
-                for tensor_name in desc.output_arg_names():
-                    attr_name = append_distributed_attr_suffix("OUT_" +
-                                                               tensor_name)
-                    self.assertIsNotNone(desc.has_attr(attr_name))
-        set_default_distributed_context(dist_context)
-        self.assertTrue("dist_attr" in str(complete_train_program))
-        with unittest.mock.patch(
-                "sys.stdout", new_callable=StringIO) as mock_stdout:
-            print_program_with_distributed_attr(complete_train_program)
-            self.assertIsNotNone(mock_stdout.getvalue())
+        self.assertTrue(dist_context.validate_dist_attr_for_program())
+
+    # def test_mlp_misc(self):
+    #     # import pdb
+    #     global _global_parallel_strategy
+    #     _global_parallel_strategy = "pp"
+    #     global _global_process_mesh
+    #     _global_process_mesh = auto.ProcessMesh(
+    #         mesh=[[0, 1], [2, 3]])
+    #     global _global_process_mesh2
+    #     _global_process_mesh2 = auto.ProcessMesh(
+    #         mesh=[[4, 5], [6, 7]])
+
+    #     train_program = static.Program()
+    #     start_program = static.Program()
+    #     dist_context = DistributedContext()
+    #     train_program, start_program = mlp_pretrain_forward(train_program,
+    #                                                         start_program)
+    #     # pdb.set_trace()
+    #     complete_train_program = auto.complete_annotation(train_program,
+    #                                                       dist_context)
+    #     # print_program_with_dist_attr(complete_train_program,
+    #     #                                     dist_context)
+    #     dist_context.finalize_distributed_attr_for_program(
+    #         complete_train_program)
+    #     from paddle.distributed.auto_parallel.interface import _g_process_mesh_map
+    #     for block in complete_train_program.blocks:
+    #         for tensor in block.vars.values():
+    #             desc = tensor.desc
+    #             attr_name = append_distributed_attr_suffix("mesh_id")
+    #             self.assertIsNotNone(desc.has_attr(attr_name))
+    #             attr_name = append_distributed_attr_suffix("dims_mapping")
+    #             self.assertIsNotNone(desc.has_attr(attr_name))
+    #         for op in block.ops:
+    #             desc = op.desc
+    #             attr_name = append_distributed_attr_suffix("mesh_id")
+    #             self.assertIsNotNone(desc.has_attr(attr_name))
+    #             for tensor_name in desc.input_arg_names():
+    #                 attr_name = append_distributed_attr_suffix("IN_" +
+    #                                                            tensor_name)
+    #                 self.assertIsNotNone(desc.has_attr(attr_name))
+    #             for tensor_name in desc.output_arg_names():
+    #                 attr_name = append_distributed_attr_suffix("OUT_" +
+    #                                                            tensor_name)
+    #                 self.assertIsNotNone(desc.has_attr(attr_name))
+    #     set_default_distributed_context(dist_context)
+    #     self.assertTrue("dist_attr" in str(complete_train_program))
+    #     with unittest.mock.patch(
+    #             "sys.stdout", new_callable=StringIO) as mock_stdout:
+    #         print_program_with_dist_attr(complete_train_program)
+    #         self.assertIsNotNone(mock_stdout.getvalue())
 
 
 class AttentionLayer(nn.Layer):
@@ -262,10 +284,18 @@ def __init__(self,
     def forward(self, input):
         if _global_parallel_strategy == "dp":
             auto.shard_tensor(
-                input, _global_process_mesh, dim_mapping=[0, -1, -1])
+                input,
+                dist_attr={
+                    "process_mesh": _global_process_mesh,
+                    "dims_mapping": [0, -1, -1]
+                })
         elif _global_parallel_strategy == "dp_mp":
             auto.shard_tensor(
-                input, _global_process_mesh, dim_mapping=[0, -1, -1])
+                input,
+                dist_attr={
+                    "process_mesh": _global_process_mesh,
+                    "dims_mapping": [0, -1, -1]
+                })
 
         q = self.q_proj(input)
         q = tensor.reshape(x=q, shape=[0, 0, self.num_heads, self.head_dim])
@@ -276,18 +306,42 @@ def forward(self, input):
 
         if _global_parallel_strategy == "mp":
             auto.shard_tensor(
-                self.q_proj.weight, _global_process_mesh, dim_mapping=[-1, 0])
-            auto.shard_tensor(
-                self.k_proj.weight, _global_process_mesh, dim_mapping=[-1, 0])
-            auto.shard_tensor(
-                self.v_proj.weight, _global_process_mesh, dim_mapping=[-1, 0])
+                self.q_proj.weight,
+                dist_attr={
+                    "process_mesh": _global_process_mesh,
+                    "dims_mapping": [-1, 0]
+                })
+            auto.shard_tensor(
+                self.k_proj.weight,
+                dist_attr={
+                    "process_mesh": _global_process_mesh,
+                    "dims_mapping": [-1, 0]
+                })
+            auto.shard_tensor(
+                self.v_proj.weight,
+                dist_attr={
+                    "process_mesh": _global_process_mesh,
+                    "dims_mapping": [-1, 0]
+                })
         elif _global_parallel_strategy == "dp_mp":
             auto.shard_tensor(
-                self.q_proj.weight, _global_process_mesh, dim_mapping=[-1, 1])
+                self.q_proj.weight,
+                dist_attr={
+                    "process_mesh": _global_process_mesh,
+                    "dims_mapping": [-1, 1]
+                })
             auto.shard_tensor(
-                self.k_proj.weight, _global_process_mesh, dim_mapping=[-1, 1])
+                self.k_proj.weight,
+                dist_attr={
+                    "process_mesh": _global_process_mesh,
+                    "dims_mapping": [-1, 1]
+                })
             auto.shard_tensor(
-                self.v_proj.weight, _global_process_mesh, dim_mapping=[-1, 1])
+                self.v_proj.weight,
+                dist_attr={
+                    "process_mesh": _global_process_mesh,
+                    "dims_mapping": [-1, 1]
+                })
 
         k = tensor.reshape(x=k, shape=[0, 0, self.num_heads, self.head_dim])
         k = tensor.transpose(x=k, perm=[0, 2, 1, 3])
@@ -320,12 +374,18 @@ def forward(self, input):
         out = self.out_proj(out)
         if _global_parallel_strategy == "mp":
             auto.shard_tensor(
-                self.out_proj.weight, _global_process_mesh,
-                dim_mapping=[0, -1])
+                self.out_proj.weight,
+                dist_attr={
+                    "process_mesh": _global_process_mesh,
+                    "dims_mapping": [0, -1]
+                })
         elif _global_parallel_strategy == "dp_mp":
             auto.shard_tensor(
-                self.out_proj.weight, _global_process_mesh,
-                dim_mapping=[1, -1])
+                self.out_proj.weight,
+                dist_attr={
+                    "process_mesh": _global_process_mesh,
+                    "dims_mapping": [1, -1]
+                })
 
         return out
 
@@ -357,8 +417,7 @@ def test_attn_dp(self):
         global _global_parallel_strategy
         _global_parallel_strategy = "dp"
         global _global_process_mesh
-        _global_process_mesh = auto.ProcessMesh(
-            mesh=[0, 1, 2, 3], parent=ROOT_MESH)
+        _global_process_mesh = auto.ProcessMesh(mesh=[0, 1, 2, 3])
         train_program = static.Program()
         start_program = static.Program()
         dist_context = DistributedContext()
@@ -366,18 +425,15 @@ def test_attn_dp(self):
                                                              start_program)
         complete_train_program = auto.complete_annotation(train_program,
                                                           dist_context)
-        # print_program_with_distributed_attr(complete_train_program,
+        # print_program_with_dist_attr(complete_train_program,
         #                                     dist_context)
-        self.assertTrue(
-            check_distributed_attr_for_program(complete_train_program,
-                                               dist_context))
+        self.assertTrue(dist_context.validate_dist_attr_for_program())
 
     def test_attn_mp(self):
         global _global_parallel_strategy
         _global_parallel_strategy = "mp"
         global _global_process_mesh
-        _global_process_mesh = auto.ProcessMesh(
-            mesh=[0, 1, 2, 3], parent=ROOT_MESH)
+        _global_process_mesh = auto.ProcessMesh(mesh=[0, 1, 2, 3])
 
         train_program = static.Program()
         start_program = static.Program()
@@ -386,18 +442,16 @@ def test_attn_mp(self):
                                                              start_program)
         complete_train_program = auto.complete_annotation(train_program,
                                                           dist_context)
-        # print_program_with_distributed_attr(complete_train_program,
+        # print_program_with_dist_attr(complete_train_program,
         #                                     dist_context)
-        self.assertTrue(
-            check_distributed_attr_for_program(complete_train_program,
-                                               dist_context))
+        self.assertTrue(dist_context.validate_dist_attr_for_program())
 
     def test_attn_dp_mp(self):
         global _global_parallel_strategy
         _global_parallel_strategy = "dp_mp"
         global _global_process_mesh
         _global_process_mesh = auto.ProcessMesh(
-            mesh=[[0, 1, 2, 3], [4, 5, 6, 7]], parent=ROOT_MESH)
+            mesh=[[0, 1, 2, 3], [4, 5, 6, 7]])
 
         train_program = static.Program()
         start_program = static.Program()
@@ -406,11 +460,9 @@ def test_attn_dp_mp(self):
                                                              start_program)
         complete_train_program = auto.complete_annotation(train_program,
                                                           dist_context)
-        # print_program_with_distributed_attr(complete_train_program,
+        # print_program_with_dist_attr(complete_train_program,
         #                                     dist_context)
-        self.assertTrue(
-            check_distributed_attr_for_program(complete_train_program,
-                                               dist_context))
+        self.assertTrue(dist_context.validate_dist_attr_for_program())
 
 
 class DecoderLayer(nn.Layer):
@@ -486,10 +538,18 @@ def __init__(self,
     def forward(self, input_ids, position_ids):
         if _global_parallel_strategy == "dp":
             auto.shard_tensor(
-                input_ids, _global_process_mesh, dim_mapping=[0, -1])
+                input_ids,
+                dist_attr={
+                    "process_mesh": _global_process_mesh,
+                    "dims_mapping": [0, -1]
+                })
         elif _global_parallel_strategy == "dp_mp":
             auto.shard_tensor(
-                input_ids, _global_process_mesh, dim_mapping=[0, -1])
+                input_ids,
+                dist_attr={
+                    "process_mesh": _global_process_mesh,
+                    "dims_mapping": [0, -1]
+                })
 
         input_embeddings = self.word_embeddings(input_ids)
         position_embeddings = self.position_embeddings(position_ids)
@@ -497,13 +557,17 @@ def forward(self, input_ids, position_ids):
         if _global_parallel_strategy == "mp":
             auto.shard_tensor(
                 self.word_embeddings.weight,
-                _global_process_mesh,
-                dim_mapping=[0, -1])
+                dist_attr={
+                    "process_mesh": _global_process_mesh,
+                    "dims_mapping": [0, -1]
+                })
         elif _global_parallel_strategy == "dp_mp":
             auto.shard_tensor(
                 self.word_embeddings.weight,
-                _global_process_mesh,
-                dim_mapping=[1, -1])
+                dist_attr={
+                    "process_mesh": _global_process_mesh,
+                    "dims_mapping": [1, -1]
+                })
 
         embeddings = input_embeddings + position_embeddings
         embeddings = self.dropout1(embeddings)
@@ -521,18 +585,42 @@ def forward(self, input_ids, position_ids):
 
         if _global_parallel_strategy == "mp":
             auto.shard_tensor(
-                self.q_proj.weight, _global_process_mesh, dim_mapping=[-1, 0])
-            auto.shard_tensor(
-                self.k_proj.weight, _global_process_mesh, dim_mapping=[-1, 0])
-            auto.shard_tensor(
-                self.v_proj.weight, _global_process_mesh, dim_mapping=[-1, 0])
+                self.q_proj.weight,
+                dist_attr={
+                    "process_mesh": _global_process_mesh,
+                    "dims_mapping": [-1, 0]
+                })
+            auto.shard_tensor(
+                self.k_proj.weight,
+                dist_attr={
+                    "process_mesh": _global_process_mesh,
+                    "dims_mapping": [-1, 0]
+                })
+            auto.shard_tensor(
+                self.v_proj.weight,
+                dist_attr={
+                    "process_mesh": _global_process_mesh,
+                    "dims_mapping": [-1, 0]
+                })
         elif _global_parallel_strategy == "dp_mp":
             auto.shard_tensor(
-                self.q_proj.weight, _global_process_mesh, dim_mapping=[-1, 1])
+                self.q_proj.weight,
+                dist_attr={
+                    "process_mesh": _global_process_mesh,
+                    "dims_mapping": [-1, 1]
+                })
             auto.shard_tensor(
-                self.k_proj.weight, _global_process_mesh, dim_mapping=[-1, 1])
+                self.k_proj.weight,
+                dist_attr={
+                    "process_mesh": _global_process_mesh,
+                    "dims_mapping": [-1, 1]
+                })
             auto.shard_tensor(
-                self.v_proj.weight, _global_process_mesh, dim_mapping=[-1, 1])
+                self.v_proj.weight,
+                dist_attr={
+                    "process_mesh": _global_process_mesh,
+                    "dims_mapping": [-1, 1]
+                })
 
         k = tensor.reshape(x=k, shape=[0, 0, self.num_heads, self.head_dim])
         k = tensor.transpose(x=k, perm=[0, 2, 1, 3])
@@ -566,12 +654,18 @@ def forward(self, input_ids, position_ids):
 
         if _global_parallel_strategy == "mp":
             auto.shard_tensor(
-                self.out_proj.weight, _global_process_mesh,
-                dim_mapping=[0, -1])
+                self.out_proj.weight,
+                dist_attr={
+                    "process_mesh": _global_process_mesh,
+                    "dims_mapping": [0, -1]
+                })
         elif _global_parallel_strategy == "dp_mp":
             auto.shard_tensor(
-                self.out_proj.weight, _global_process_mesh,
-                dim_mapping=[1, -1])
+                self.out_proj.weight,
+                dist_attr={
+                    "process_mesh": _global_process_mesh,
+                    "dims_mapping": [1, -1]
+                })
 
         # Add residual
         residual = embeddings + self.dropout2(out)
@@ -586,14 +680,30 @@ def forward(self, input_ids, position_ids):
 
         if _global_parallel_strategy == "mp":
             auto.shard_tensor(
-                self.linear0.weight, _global_process_mesh, dim_mapping=[-1, 0])
-            auto.shard_tensor(
-                self.linear1.weight, _global_process_mesh, dim_mapping=[0, -1])
+                self.linear0.weight,
+                dist_attr={
+                    "process_mesh": _global_process_mesh,
+                    "dims_mapping": [-1, 0]
+                })
+            auto.shard_tensor(
+                self.linear1.weight,
+                dist_attr={
+                    "process_mesh": _global_process_mesh,
+                    "dims_mapping": [0, -1]
+                })
         elif _global_parallel_strategy == "dp_mp":
             auto.shard_tensor(
-                self.linear0.weight, _global_process_mesh, dim_mapping=[-1, 1])
+                self.linear0.weight,
+                dist_attr={
+                    "process_mesh": _global_process_mesh,
+                    "dims_mapping": [-1, 1]
+                })
             auto.shard_tensor(
-                self.linear1.weight, _global_process_mesh, dim_mapping=[1, -1])
+                self.linear1.weight,
+                dist_attr={
+                    "process_mesh": _global_process_mesh,
+                    "dims_mapping": [1, -1]
+                })
 
         # Add residual
         final = residual + self.dropout3(out3)
@@ -631,8 +741,7 @@ def test_decoder_dp(self):
         global _global_parallel_strategy
         _global_parallel_strategy = "dp"
         global _global_process_mesh
-        _global_process_mesh = auto.ProcessMesh(
-            mesh=[0, 1, 2, 3], parent=ROOT_MESH)
+        _global_process_mesh = auto.ProcessMesh(mesh=[0, 1, 2, 3])
         train_program = static.Program()
         start_program = static.Program()
         dist_context = DistributedContext()
@@ -640,18 +749,15 @@ def test_decoder_dp(self):
                                                                 start_program)
         complete_train_program = auto.complete_annotation(train_program,
                                                           dist_context)
-        # print_program_with_distributed_attr(complete_train_program,
+        # print_program_with_dist_attr(complete_train_program,
         #                                     dist_context)
-        self.assertTrue(
-            check_distributed_attr_for_program(complete_train_program,
-                                               dist_context))
+        self.assertTrue(dist_context.validate_dist_attr_for_program())
 
     def test_decoder_mp(self):
         global _global_parallel_strategy
         _global_parallel_strategy = "mp"
         global _global_process_mesh
-        _global_process_mesh = auto.ProcessMesh(
-            mesh=[0, 1, 2, 3], parent=ROOT_MESH)
+        _global_process_mesh = auto.ProcessMesh(mesh=[0, 1, 2, 3])
 
         train_program = static.Program()
         start_program = static.Program()
@@ -660,18 +766,16 @@ def test_decoder_mp(self):
                                                                 start_program)
         complete_train_program = auto.complete_annotation(train_program,
                                                           dist_context)
-        # print_program_with_distributed_attr(complete_train_program,
+        # print_program_with_dist_attr(complete_train_program,
         #                                     dist_context)
-        self.assertTrue(
-            check_distributed_attr_for_program(complete_train_program,
-                                               dist_context))
+        self.assertTrue(dist_context.validate_dist_attr_for_program())
 
     def test_decoder_dp_mp(self):
         global _global_parallel_strategy
         _global_parallel_strategy = "dp_mp"
         global _global_process_mesh
         _global_process_mesh = auto.ProcessMesh(
-            mesh=[[0, 1, 2, 3], [4, 5, 6, 7]], parent=ROOT_MESH)
+            mesh=[[0, 1, 2, 3], [4, 5, 6, 7]])
 
         train_program = static.Program()
         start_program = static.Program()
@@ -680,11 +784,9 @@ def test_decoder_dp_mp(self):
                                                                 start_program)
         complete_train_program = auto.complete_annotation(train_program,
                                                           dist_context)
-        # print_program_with_distributed_attr(complete_train_program,
+        # print_program_with_dist_attr(complete_train_program,
         #                                     dist_context)
-        self.assertTrue(
-            check_distributed_attr_for_program(complete_train_program,
-                                               dist_context))
+        self.assertTrue(dist_context.validate_dist_attr_for_program())
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/test_auto_parallel_completion_gpt.py b/python/paddle/fluid/tests/unittests/test_auto_parallel_completion_gpt.py
index cd87a72a7e68f..c2c1e63155c3a 100644
--- a/python/paddle/fluid/tests/unittests/test_auto_parallel_completion_gpt.py
+++ b/python/paddle/fluid/tests/unittests/test_auto_parallel_completion_gpt.py
@@ -32,13 +32,12 @@
 import paddle.static as static
 import paddle.distributed.auto_parallel as auto
 from paddle.distributed.auto_parallel.utils import check_distributed_attr_for_program
-from paddle.distributed.auto_parallel.utils import print_program_with_distributed_attr
-from paddle.distributed.auto_parallel.context import DistributedContext
+from paddle.distributed.auto_parallel.utils import print_program_with_dist_attr
+from paddle.distributed.auto_parallel.dist_context import DistributedContext
 
 paddle.enable_static()
 _global_parallel_strategy = None
 _global_process_mesh = None
-ROOT_MESH = auto.ProcessMesh([[0, 1, 2, 3], [4, 5, 6, 7]])
 
 
 class MultiHeadAttention(nn.Layer):
@@ -108,10 +107,18 @@ def _prepare_qkv(self, query, key, value, use_cache=False, cache=None):
 
         if _global_parallel_strategy == "mp":
             auto.shard_tensor(
-                self.q_proj.weight, _global_process_mesh, dim_mapping=[-1, 0])
+                self.q_proj.weight,
+                dist_attr={
+                    "process_mesh": _global_process_mesh,
+                    "dims_mapping": [-1, 0]
+                })
         elif _global_parallel_strategy == "dp_mp":
             auto.shard_tensor(
-                self.q_proj.weight, _global_process_mesh, dim_mapping=[-1, 1])
+                self.q_proj.weight,
+                dist_attr={
+                    "process_mesh": _global_process_mesh,
+                    "dims_mapping": [-1, 1]
+                })
 
         q = tensor.reshape(x=q, shape=[0, 0, self.num_heads, self.head_dim])
         q = tensor.transpose(x=q, perm=[0, 2, 1, 3])
@@ -145,19 +152,35 @@ def compute_kv(self, key, value):
 
         if _global_parallel_strategy == "mp":
             auto.shard_tensor(
-                self.k_proj.weight, _global_process_mesh, dim_mapping=[-1, 0])
+                self.k_proj.weight,
+                dist_attr={
+                    "process_mesh": _global_process_mesh,
+                    "dims_mapping": [-1, 0]
+                })
         elif _global_parallel_strategy == "dp_mp":
             auto.shard_tensor(
-                self.k_proj.weight, _global_process_mesh, dim_mapping=[-1, 1])
+                self.k_proj.weight,
+                dist_attr={
+                    "process_mesh": _global_process_mesh,
+                    "dims_mapping": [-1, 1]
+                })
 
         v = self.v_proj(value)
 
         if _global_parallel_strategy == "mp":
             auto.shard_tensor(
-                self.v_proj.weight, _global_process_mesh, dim_mapping=[-1, 0])
+                self.v_proj.weight,
+                dist_attr={
+                    "process_mesh": _global_process_mesh,
+                    "dims_mapping": [-1, 0]
+                })
         elif _global_parallel_strategy == "dp_mp":
             auto.shard_tensor(
-                self.v_proj.weight, _global_process_mesh, dim_mapping=[-1, 1])
+                self.v_proj.weight,
+                dist_attr={
+                    "process_mesh": _global_process_mesh,
+                    "dims_mapping": [-1, 1]
+                })
 
         k = tensor.reshape(x=k, shape=[0, 0, self.num_heads, self.head_dim])
         k = tensor.transpose(x=k, perm=[0, 2, 1, 3])
@@ -238,12 +261,18 @@ def forward(self,
 
         if _global_parallel_strategy == "mp":
             auto.shard_tensor(
-                self.out_proj.weight, _global_process_mesh,
-                dim_mapping=[0, -1])
+                self.out_proj.weight,
+                dist_attr={
+                    "process_mesh": _global_process_mesh,
+                    "dims_mapping": [0, -1]
+                })
         elif _global_parallel_strategy == "dp_mp":
             auto.shard_tensor(
-                self.out_proj.weight, _global_process_mesh,
-                dim_mapping=[1, -1])
+                self.out_proj.weight,
+                dist_attr={
+                    "process_mesh": _global_process_mesh,
+                    "dims_mapping": [1, -1]
+                })
 
         outs = [out]
         if self.need_weights:
@@ -411,17 +440,33 @@ def forward(self, tgt, memory, tgt_mask=None, use_cache=False, cache=None):
 
         if _global_parallel_strategy == "mp":
             auto.shard_tensor(
-                self.linear1.weight, _global_process_mesh, dim_mapping=[-1, 0])
+                self.linear1.weight,
+                dist_attr={
+                    "process_mesh": _global_process_mesh,
+                    "dims_mapping": [-1, 0]
+                })
         elif _global_parallel_strategy == "dp_mp":
             auto.shard_tensor(
-                self.linear1.weight, _global_process_mesh, dim_mapping=[-1, 1])
+                self.linear1.weight,
+                dist_attr={
+                    "process_mesh": _global_process_mesh,
+                    "dims_mapping": [-1, 1]
+                })
 
         if _global_parallel_strategy == "mp":
             auto.shard_tensor(
-                self.linear2.weight, _global_process_mesh, dim_mapping=[0, -1])
+                self.linear2.weight,
+                dist_attr={
+                    "process_mesh": _global_process_mesh,
+                    "dims_mapping": [0, -1]
+                })
         elif _global_parallel_strategy == "dp_mp":
             auto.shard_tensor(
-                self.linear2.weight, _global_process_mesh, dim_mapping=[1, -1])
+                self.linear2.weight,
+                dist_attr={
+                    "process_mesh": _global_process_mesh,
+                    "dims_mapping": [1, -1]
+                })
 
         # tgt = self.dropout2(
         #     self.linear2(F.gelu(
@@ -485,13 +530,17 @@ def forward(self, input_ids, position_ids=None):
         if _global_parallel_strategy == "mp":
             auto.shard_tensor(
                 self.word_embeddings.weight,
-                _global_process_mesh,
-                dim_mapping=[0, -1])
+                dist_attr={
+                    "process_mesh": _global_process_mesh,
+                    "dims_mapping": [0, -1]
+                })
         elif _global_parallel_strategy == "dp_mp":
             auto.shard_tensor(
                 self.word_embeddings.weight,
-                _global_process_mesh,
-                dim_mapping=[1, -1])
+                dist_attr={
+                    "process_mesh": _global_process_mesh,
+                    "dims_mapping": [1, -1]
+                })
 
         position_embeddings = self.position_embeddings(position_ids)
         embeddings = input_embedings + position_embeddings
@@ -717,10 +766,18 @@ def gpt_pretrain_forward(train_program, start_program):
 
         if _global_parallel_strategy == "dp":
             auto.shard_tensor(
-                input_ids, _global_process_mesh, dim_mapping=[0, -1])
+                input_ids,
+                dist_attr={
+                    "process_mesh": _global_process_mesh,
+                    "dims_mapping": [0, -1]
+                })
         elif _global_parallel_strategy == "dp_mp":
             auto.shard_tensor(
-                input_ids, _global_process_mesh, dim_mapping=[0, -1])
+                input_ids,
+                dist_attr={
+                    "process_mesh": _global_process_mesh,
+                    "dims_mapping": [0, -1]
+                })
 
         gpt = GPTModel(
             vocab_size=32768,
@@ -753,8 +810,7 @@ def test_gpt_dp(self):
         global _global_parallel_strategy
         _global_parallel_strategy = "dp"
         global _global_process_mesh
-        _global_process_mesh = auto.ProcessMesh(
-            mesh=[0, 1, 2, 3], parent=ROOT_MESH)
+        _global_process_mesh = auto.ProcessMesh(mesh=[0, 1, 2, 3])
 
         train_program = static.Program()
         start_program = static.Program()
@@ -763,18 +819,15 @@ def test_gpt_dp(self):
                                                             start_program)
         complete_train_program = auto.complete_annotation(train_program,
                                                           dist_context)
-        # print_program_with_distributed_attr(complete_train_program,
+        # print_program_with_dist_attr(complete_train_program,
         #                                     dist_context)
-        self.assertTrue(
-            check_distributed_attr_for_program(complete_train_program,
-                                               dist_context))
+        self.assertTrue(dist_context.validate_dist_attr_for_program())
 
     def test_gpt_mp(self):
         global _global_parallel_strategy
         _global_parallel_strategy = "mp"
         global _global_process_mesh
-        _global_process_mesh = auto.ProcessMesh(
-            mesh=[0, 1, 2, 3], parent=ROOT_MESH)
+        _global_process_mesh = auto.ProcessMesh(mesh=[0, 1, 2, 3])
 
         train_program = static.Program()
         start_program = static.Program()
@@ -783,18 +836,16 @@ def test_gpt_mp(self):
                                                             start_program)
         complete_train_program = auto.complete_annotation(train_program,
                                                           dist_context)
-        # print_program_with_distributed_attr(complete_train_program,
+        # print_program_with_dist_attr(complete_train_program,
         #                                     dist_context)
-        self.assertTrue(
-            check_distributed_attr_for_program(complete_train_program,
-                                               dist_context))
+        self.assertTrue(dist_context.validate_dist_attr_for_program())
 
     def test_gpt_dp_mp(self):
         global _global_parallel_strategy
         _global_parallel_strategy = "dp_mp"
         global _global_process_mesh
         _global_process_mesh = auto.ProcessMesh(
-            mesh=[[0, 1, 2, 3], [4, 5, 6, 7]], parent=ROOT_MESH)
+            mesh=[[0, 1, 2, 3], [4, 5, 6, 7]])
 
         train_program = static.Program()
         start_program = static.Program()
@@ -803,11 +854,9 @@ def test_gpt_dp_mp(self):
                                                             start_program)
         complete_train_program = auto.complete_annotation(train_program,
                                                           dist_context)
-        # print_program_with_distributed_attr(complete_train_program,
+        # print_program_with_dist_attr(complete_train_program,
         #                                     dist_context)
-        self.assertTrue(
-            check_distributed_attr_for_program(complete_train_program,
-                                               dist_context))
+        self.assertTrue(dist_context.validate_dist_attr_for_program())
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/test_auto_parallel_cost_model.py b/python/paddle/fluid/tests/unittests/test_auto_parallel_cost_model.py
index 000b1db61381e..4c9c01b99e050 100644
--- a/python/paddle/fluid/tests/unittests/test_auto_parallel_cost_model.py
+++ b/python/paddle/fluid/tests/unittests/test_auto_parallel_cost_model.py
@@ -23,21 +23,19 @@
 import paddle.nn.functional as F
 import paddle.utils as utils
 import paddle.distributed.auto_parallel as auto
-from paddle.distributed.auto_parallel.context import DistributedContext
+from paddle.distributed.auto_parallel.dist_context import DistributedContext
 from paddle.distributed import fleet
 from paddle.distributed.auto_parallel.partitioner import Partitioner
 from paddle.distributed.auto_parallel.completion import complete_backward_annotation
 from paddle.distributed.auto_parallel.reshard import reshard
 from paddle.distributed.auto_parallel.cost_model import estimate_cost
 import paddle.fluid.core as core
+from paddle.distributed.auto_parallel.utils import print_program_with_dist_attr
 
 paddle.enable_static()
 _global_parallel_strategy = "dp_mp_pp"
-ROOT_MESH = auto.ProcessMesh([[[0, 1], [4, 5]], [[2, 3], [6, 7]]])
-_global_process_mesh = auto.ProcessMesh(
-    [[[0, 1], [4, 5]], [[2, 3], [6, 7]]], parent=ROOT_MESH)
-PP_MESH_0 = auto.ProcessMesh([[0, 1], [4, 5]], parent=ROOT_MESH)
-PP_MESH_1 = auto.ProcessMesh([[2, 3], [6, 7]], parent=ROOT_MESH)
+PP_MESH_0 = auto.ProcessMesh([[0, 1], [4, 5]])
+PP_MESH_1 = auto.ProcessMesh([[2, 3], [6, 7]])
 NUM_RANKS = 8
 STAGE_0_CNT = 5
 STAGE_1_CNT = 10
@@ -70,9 +68,13 @@ def __init__(self,
     def forward(self, input):
         if self.is_distributed:
             auto.shard_tensor(
-                self.linear0.weight, PP_MESH_0, dim_mapping=[-1, 1])
+                self.linear0.weight,
+                dist_attr={"process_mesh": PP_MESH_0,
+                           "dims_mapping": [-1, 1]})
             auto.shard_tensor(
-                self.linear1.weight, PP_MESH_1, dim_mapping=[1, -1])
+                self.linear1.weight,
+                dist_attr={"process_mesh": PP_MESH_1,
+                           "dims_mapping": [1, -1]})
 
         out = self.norm(input)
         out = self.linear0(out)
@@ -120,8 +122,14 @@ def mlp_forward(train_program, start_program, is_distributed=True):
                 name="label", shape=[batch_size, 1], dtype='float32')
 
         if is_distributed:
-            auto.shard_tensor(input, PP_MESH_0, dim_mapping=[0, -1])
-            auto.shard_tensor(label, PP_MESH_1, dim_mapping=[0, -1])
+            auto.shard_tensor(
+                input,
+                dist_attr={"process_mesh": PP_MESH_0,
+                           "dims_mapping": [0, -1]})
+            auto.shard_tensor(
+                label,
+                dist_attr={"process_mesh": PP_MESH_1,
+                           "dims_mapping": [0, -1]})
 
         mlp = MLPLayer(
             hidden_size=hidden_size,
@@ -137,8 +145,6 @@ def mlp_forward(train_program, start_program, is_distributed=True):
 
 
 def get_dist_prog(train_program, startup_program, dist_context, rank_id):
-    global _global_process_mesh
-    dist_context.set_process_mesh(_global_process_mesh)
     loss, train_program, startup_program = mlp_forward(train_program,
                                                        startup_program)
 
diff --git a/python/paddle/fluid/tests/unittests/test_auto_parallel_partitioner.py b/python/paddle/fluid/tests/unittests/test_auto_parallel_partitioner.py
index 44a525244015b..3a23f9b2611dc 100755
--- a/python/paddle/fluid/tests/unittests/test_auto_parallel_partitioner.py
+++ b/python/paddle/fluid/tests/unittests/test_auto_parallel_partitioner.py
@@ -29,19 +29,17 @@
 from paddle.nn.layer.transformer import _convert_param_attr_to_list
 import paddle.distributed.auto_parallel as auto
 from paddle.distributed.auto_parallel.utils import check_distributed_attr_for_program
-from paddle.distributed.auto_parallel.utils import print_program_with_distributed_attr
+from paddle.distributed.auto_parallel.utils import print_program_with_dist_attr
 from paddle.distributed.auto_parallel.utils import append_distributed_attr_suffix
-from paddle.distributed.auto_parallel.context import DistributedContext
-from paddle.distributed.auto_parallel.context import set_default_distributed_context
+from paddle.distributed.auto_parallel.dist_context import DistributedContext
 from paddle.distributed import fleet
 from paddle.distributed.auto_parallel.partitioner import Partitioner
 from paddle.distributed.auto_parallel.utils import _get_comm_group
-from paddle.distributed.auto_parallel.process import new_process_group
+from paddle.distributed.auto_parallel.process_group import new_process_group
 
 paddle.enable_static()
 _global_parallel_strategy = None
 _global_process_mesh = None
-ROOT_MESH = auto.ProcessMesh([[0, 1, 2, 3], [4, 5, 6, 7]])
 
 
 def get_programs(annotated_func):
@@ -49,7 +47,7 @@ def get_programs(annotated_func):
     start_program = static.Program()
     dist_context = DistributedContext()
     global _global_process_mesh
-    dist_context.set_process_mesh(_global_process_mesh)
+    dist_context.process_mesh = _global_process_mesh
     train_program, start_program = annotated_func(train_program, start_program)
     complete_train_program = auto.complete_annotation(train_program,
                                                       dist_context)
@@ -95,9 +93,8 @@ def initialization_check(mode, dist_context, dist_startup_prog,
                          serial_startup_prog, var_need_broadcast, process_mesh,
                          mp_parallel_axis, dp_parallel_axis):
     if 'mp' in mode:
-        group_ranks = _get_comm_group(process_mesh.process_group,
-                                      process_mesh.topology, mp_parallel_axis,
-                                      3)
+        group_ranks = _get_comm_group(
+            process_mesh.processes, process_mesh.topology, mp_parallel_axis, 3)
         mp_ring_id = new_process_group(group_ranks).id
         broadcast_ops = [
             op for op in dist_startup_prog.global_block().ops
@@ -110,9 +107,8 @@ def initialization_check(mode, dist_context, dist_startup_prog,
             return False
 
     if 'dp' in mode:
-        group_ranks = _get_comm_group(process_mesh.process_group,
-                                      process_mesh.topology, dp_parallel_axis,
-                                      3)
+        group_ranks = _get_comm_group(
+            process_mesh.processes, process_mesh.topology, dp_parallel_axis, 3)
         dp_ring_id = new_process_group(group_ranks).id
         nparam = len(serial_startup_prog.all_parameters())
         nbroadcast_dp = len([
@@ -137,22 +133,21 @@ def initialization_check(mode, dist_context, dist_startup_prog,
 def get_input_var_dist_attr(op, main_program, dist_context):
     varname = op.desc.input_arg_names()
     var = main_program.global_block().var(varname[0])
-    dist_attr = dist_context.get_tensor_distributed_attr_for_program(var)
+    dist_attr = dist_context.get_tensor_dist_attr_for_program(var)
     return dist_attr
 
 
 def get_output_var_dist_attr(op, main_program, dist_context):
     varname = op.desc.output_arg_names()
     var = main_program.global_block().var(varname[0])
-    dist_attr = dist_context.get_tensor_distributed_attr_for_program(var)
+    dist_attr = dist_context.get_tensor_dist_attr_for_program(var)
     return dist_attr
 
 
 def check_equal_var_dist_attr(serial_dist_attr, dist_attr):
     equal = True
-    if serial_dist_attr.get_process_mesh() != dist_attr.get_process_mesh() or \
-        serial_dist_attr.is_parameter() != dist_attr.is_parameter() or \
-        serial_dist_attr.get_dims_mapping() != dist_attr.get_dims_mapping():
+    if serial_dist_attr.process_mesh != dist_attr.process_mesh or \
+        serial_dist_attr.dims_mapping != dist_attr.dims_mapping:
         equal = False
     return equal
 
@@ -161,36 +156,33 @@ def check_equal_dist_op_attr(dist_context, dist_main_prog, serial_op, dist_ops,
                              dist_op_idx):
     equal = True
     # get serial op's process_mesh and impl_idx
-    serial_op_dist_attr = dist_context.get_op_distributed_attr_for_program(
-        serial_op)
-    serial_process_mesh = serial_op_dist_attr.get_process_mesh()
-    serial_impl_idx = serial_op_dist_attr.get_impl_idx()
+    serial_op_dist_attr = dist_context.get_op_dist_attr_for_program(serial_op)
+    serial_process_mesh = serial_op_dist_attr.process_mesh
+    serial_impl_idx = serial_op_dist_attr.impl_idx
 
     # check dist_attr between serial op and dist op
     for i in dist_op_idx:
-        op_dist_attr = dist_context.get_op_distributed_attr_for_program(
-            dist_ops[i])
+        op_dist_attr = dist_context.get_op_dist_attr_for_program(dist_ops[i])
         for in_varname in dist_ops[i].desc.input_arg_names():
             in_var = dist_main_prog.global_block().var(in_varname)
-            tensor_dist_attr = dist_context.get_tensor_distributed_attr_for_program(
+            tensor_dist_attr = dist_context.get_tensor_dist_attr_for_program(
                 in_var)
-            tensor_dims_mapping = tensor_dist_attr.get_dims_mapping()
+            tensor_dims_mapping = tensor_dist_attr.dims_mapping
             in_var_dims_mapping = op_dist_attr.get_input_dims_mapping(
                 in_varname)
             if tensor_dims_mapping != in_var_dims_mapping:
                 equal = False
         for out_varname in dist_ops[i].desc.output_arg_names():
             out_var = dist_main_prog.global_block().var(out_varname)
-            tensor_dist_attr = dist_context.get_tensor_distributed_attr_for_program(
+            tensor_dist_attr = dist_context.get_tensor_dist_attr_for_program(
                 out_var)
-            tensor_dims_mapping = tensor_dist_attr.get_dims_mapping()
+            tensor_dims_mapping = tensor_dist_attr.dims_mapping
             out_var_dims_mapping = op_dist_attr.get_output_dims_mapping(
                 out_varname)
             if tensor_dims_mapping != out_var_dims_mapping:
                 equal = False
-
-        dist_op_process_mesh = op_dist_attr.get_process_mesh()
-        dist_op_impl_idx = op_dist_attr.get_impl_idx()
+        dist_op_process_mesh = op_dist_attr.process_mesh
+        dist_op_impl_idx = op_dist_attr.impl_idx
         if serial_op.desc.id() == dist_ops[i].desc.id() or \
             serial_process_mesh != dist_op_process_mesh or \
             serial_impl_idx != dist_op_impl_idx:
@@ -242,13 +234,13 @@ def distributed_attr_check_for_program(dist_main_prog, dist_context):
     have_dist_attr = True
     for block in dist_main_prog.blocks:
         for tensor in block.vars.values():
-            var_dist_attr = dist_context.get_tensor_distributed_attr_for_program(
+            var_dist_attr = dist_context.get_tensor_dist_attr_for_program(
                 tensor)
             if var_dist_attr is None:
                 have_dist_attr = False
 
         for op in block.ops:
-            op_dist_attr = dist_context.get_op_distributed_attr_for_program(op)
+            op_dist_attr = dist_context.get_op_dist_attr_for_program(op)
             if op_dist_attr is None:
                 have_dist_attr = False
 
@@ -278,21 +270,43 @@ def __init__(self,
     def forward(self, input):
         if _global_parallel_strategy == "mp":
             auto.shard_tensor(
-                self.linear0.weight, _global_process_mesh, dim_mapping=[-1, 0])
+                self.linear0.weight,
+                dist_attr={
+                    "process_mesh": _global_process_mesh,
+                    "dims_mapping": [-1, 0]
+                })
             auto.shard_tensor(
-                self.linear1.weight, _global_process_mesh, dim_mapping=[0, -1])
+                self.linear1.weight,
+                dist_attr={
+                    "process_mesh": _global_process_mesh,
+                    "dims_mapping": [0, -1]
+                })
         elif _global_parallel_strategy == "dp_mp":
             auto.shard_tensor(
-                self.linear0.weight, _global_process_mesh, dim_mapping=[-1, 1])
+                self.linear0.weight,
+                dist_attr={
+                    "process_mesh": _global_process_mesh,
+                    "dims_mapping": [-1, 1]
+                })
             auto.shard_tensor(
-                self.linear1.weight, _global_process_mesh, dim_mapping=[1, -1])
+                self.linear1.weight,
+                dist_attr={
+                    "process_mesh": _global_process_mesh,
+                    "dims_mapping": [1, -1]
+                })
         else:
             auto.shard_tensor(
-                self.linear0.weight, _global_process_mesh,
-                dim_mapping=[-1, -1])
+                self.linear0.weight,
+                dist_attr={
+                    "process_mesh": _global_process_mesh,
+                    "dims_mapping": [-1, -1]
+                })
             auto.shard_tensor(
-                self.linear1.weight, _global_process_mesh,
-                dim_mapping=[-1, -1])
+                self.linear1.weight,
+                dist_attr={
+                    "process_mesh": _global_process_mesh,
+                    "dims_mapping": [-1, -1]
+                })
 
         out = self.norm(input)
         out = self.linear0(out)
@@ -316,10 +330,18 @@ def mlp_pretrain_forward(train_program, start_program):
 
         if _global_parallel_strategy == "dp":
             auto.shard_tensor(
-                input, _global_process_mesh, dim_mapping=[0, -1, -1])
+                input,
+                dist_attr={
+                    "process_mesh": _global_process_mesh,
+                    "dims_mapping": [0, -1, -1]
+                })
         elif _global_parallel_strategy == "dp_mp":
             auto.shard_tensor(
-                input, _global_process_mesh, dim_mapping=[0, -1, -1])
+                input,
+                dist_attr={
+                    "process_mesh": _global_process_mesh,
+                    "dims_mapping": [0, -1, -1]
+                })
 
         mlp = MLPLayer(
             hidden_size=hidden_size,
@@ -335,8 +357,7 @@ def test_mlp_dp(self):
         global _global_parallel_strategy
         _global_parallel_strategy = "dp"
         global _global_process_mesh
-        _global_process_mesh = auto.ProcessMesh(
-            mesh=[0, 1, 2, 3], parent=ROOT_MESH)
+        _global_process_mesh = auto.ProcessMesh(mesh=[0, 1, 2, 3])
 
         serial_main_prog, serial_startup_prog, dist_main_prog, dist_startup_prog, dist_context = get_programs(
             mlp_pretrain_forward)
@@ -372,8 +393,7 @@ def test_mlp_mp(self):
         global _global_parallel_strategy
         _global_parallel_strategy = "mp"
         global _global_process_mesh
-        _global_process_mesh = auto.ProcessMesh(
-            mesh=[0, 1, 2, 3], parent=ROOT_MESH)
+        _global_process_mesh = auto.ProcessMesh(mesh=[0, 1, 2, 3])
         serial_main_prog, serial_startup_prog, dist_main_prog, dist_startup_prog, dist_context = get_programs(
             mlp_pretrain_forward)
 
@@ -437,7 +457,7 @@ def test_mlp_dp_mp(self):
         _global_parallel_strategy = "dp_mp"
         global _global_process_mesh
         _global_process_mesh = auto.ProcessMesh(
-            mesh=[[0, 1, 2, 3], [4, 5, 6, 7]], parent=ROOT_MESH)
+            mesh=[[0, 1, 2, 3], [4, 5, 6, 7]])
         serial_main_prog, serial_startup_prog, dist_main_prog, dist_startup_prog, dist_context = get_programs(
             mlp_pretrain_forward)
 
@@ -535,10 +555,18 @@ def __init__(self,
     def forward(self, input):
         if _global_parallel_strategy == "dp":
             auto.shard_tensor(
-                input, _global_process_mesh, dim_mapping=[0, -1, -1])
+                input,
+                dist_attr={
+                    "process_mesh": _global_process_mesh,
+                    "dims_mapping": [0, -1, -1]
+                })
         elif _global_parallel_strategy == "dp_mp":
             auto.shard_tensor(
-                input, _global_process_mesh, dim_mapping=[0, -1, -1])
+                input,
+                dist_attr={
+                    "process_mesh": _global_process_mesh,
+                    "dims_mapping": [0, -1, -1]
+                })
 
         q = self.q_proj(input)
         q = tensor.reshape(x=q, shape=[0, 0, self.num_heads, self.head_dim])
@@ -549,18 +577,42 @@ def forward(self, input):
 
         if _global_parallel_strategy == "mp":
             auto.shard_tensor(
-                self.q_proj.weight, _global_process_mesh, dim_mapping=[-1, 0])
+                self.q_proj.weight,
+                dist_attr={
+                    "process_mesh": _global_process_mesh,
+                    "dims_mapping": [-1, 0]
+                })
             auto.shard_tensor(
-                self.k_proj.weight, _global_process_mesh, dim_mapping=[-1, 0])
+                self.k_proj.weight,
+                dist_attr={
+                    "process_mesh": _global_process_mesh,
+                    "dims_mapping": [-1, 0]
+                })
             auto.shard_tensor(
-                self.v_proj.weight, _global_process_mesh, dim_mapping=[-1, 0])
+                self.v_proj.weight,
+                dist_attr={
+                    "process_mesh": _global_process_mesh,
+                    "dims_mapping": [-1, 0]
+                })
         elif _global_parallel_strategy == "dp_mp":
             auto.shard_tensor(
-                self.q_proj.weight, _global_process_mesh, dim_mapping=[-1, 1])
+                self.q_proj.weight,
+                dist_attr={
+                    "process_mesh": _global_process_mesh,
+                    "dims_mapping": [-1, 1]
+                })
             auto.shard_tensor(
-                self.k_proj.weight, _global_process_mesh, dim_mapping=[-1, 1])
+                self.k_proj.weight,
+                dist_attr={
+                    "process_mesh": _global_process_mesh,
+                    "dims_mapping": [-1, 1]
+                })
             auto.shard_tensor(
-                self.v_proj.weight, _global_process_mesh, dim_mapping=[-1, 1])
+                self.v_proj.weight,
+                dist_attr={
+                    "process_mesh": _global_process_mesh,
+                    "dims_mapping": [-1, 1]
+                })
 
         k = tensor.reshape(x=k, shape=[0, 0, self.num_heads, self.head_dim])
         k = tensor.transpose(x=k, perm=[0, 2, 1, 3])
@@ -593,12 +645,18 @@ def forward(self, input):
         out = self.out_proj(out)
         if _global_parallel_strategy == "mp":
             auto.shard_tensor(
-                self.out_proj.weight, _global_process_mesh,
-                dim_mapping=[0, -1])
+                self.out_proj.weight,
+                dist_attr={
+                    "process_mesh": _global_process_mesh,
+                    "dims_mapping": [0, -1]
+                })
         elif _global_parallel_strategy == "dp_mp":
             auto.shard_tensor(
-                self.out_proj.weight, _global_process_mesh,
-                dim_mapping=[1, -1])
+                self.out_proj.weight,
+                dist_attr={
+                    "process_mesh": _global_process_mesh,
+                    "dims_mapping": [1, -1]
+                })
 
         return out
 
@@ -630,8 +688,7 @@ def test_attn_dp(self):
         global _global_parallel_strategy
         _global_parallel_strategy = "dp"
         global _global_process_mesh
-        _global_process_mesh = auto.ProcessMesh(
-            mesh=[0, 1, 2, 3], parent=ROOT_MESH)
+        _global_process_mesh = auto.ProcessMesh(mesh=[0, 1, 2, 3])
 
         serial_main_prog, serial_startup_prog, dist_main_prog, dist_startup_prog, dist_context = get_programs(
             attn_pretrain_forward)
@@ -666,8 +723,7 @@ def test_attn_mp(self):
         global _global_parallel_strategy
         _global_parallel_strategy = "mp"
         global _global_process_mesh
-        _global_process_mesh = auto.ProcessMesh(
-            mesh=[0, 1, 2, 3], parent=ROOT_MESH)
+        _global_process_mesh = auto.ProcessMesh(mesh=[0, 1, 2, 3])
 
         serial_main_prog, serial_startup_prog, dist_main_prog, dist_startup_prog, dist_context = get_programs(
             attn_pretrain_forward)
@@ -735,7 +791,7 @@ def test_attn_dp_mp(self):
         _global_parallel_strategy = "dp_mp"
         global _global_process_mesh
         _global_process_mesh = auto.ProcessMesh(
-            mesh=[[0, 1, 2, 3], [4, 5, 6, 7]], parent=ROOT_MESH)
+            mesh=[[0, 1, 2, 3], [4, 5, 6, 7]])
 
         serial_main_prog, serial_startup_prog, dist_main_prog, dist_startup_prog, dist_context = get_programs(
             attn_pretrain_forward)
@@ -871,10 +927,18 @@ def __init__(self,
     def forward(self, input_ids, position_ids):
         if _global_parallel_strategy == "dp":
             auto.shard_tensor(
-                input_ids, _global_process_mesh, dim_mapping=[0, -1])
+                input_ids,
+                dist_attr={
+                    "process_mesh": _global_process_mesh,
+                    "dims_mapping": [0, -1]
+                })
         elif _global_parallel_strategy == "dp_mp":
             auto.shard_tensor(
-                input_ids, _global_process_mesh, dim_mapping=[0, -1])
+                input_ids,
+                dist_attr={
+                    "process_mesh": _global_process_mesh,
+                    "dims_mapping": [0, -1]
+                })
 
         input_embeddings = self.word_embeddings(input_ids)
         position_embeddings = self.position_embeddings(position_ids)
@@ -882,13 +946,17 @@ def forward(self, input_ids, position_ids):
         if _global_parallel_strategy == "mp":
             auto.shard_tensor(
                 self.word_embeddings.weight,
-                _global_process_mesh,
-                dim_mapping=[0, -1])
+                dist_attr={
+                    "process_mesh": _global_process_mesh,
+                    "dims_mapping": [0, -1]
+                })
         elif _global_parallel_strategy == "dp_mp":
             auto.shard_tensor(
                 self.word_embeddings.weight,
-                _global_process_mesh,
-                dim_mapping=[1, -1])
+                dist_attr={
+                    "process_mesh": _global_process_mesh,
+                    "dims_mapping": [1, -1]
+                })
 
         embeddings = input_embeddings + position_embeddings
         embeddings = self.dropout1(embeddings)
@@ -906,18 +974,42 @@ def forward(self, input_ids, position_ids):
 
         if _global_parallel_strategy == "mp":
             auto.shard_tensor(
-                self.q_proj.weight, _global_process_mesh, dim_mapping=[-1, 0])
+                self.q_proj.weight,
+                dist_attr={
+                    "process_mesh": _global_process_mesh,
+                    "dims_mapping": [-1, 0]
+                })
             auto.shard_tensor(
-                self.k_proj.weight, _global_process_mesh, dim_mapping=[-1, 0])
+                self.k_proj.weight,
+                dist_attr={
+                    "process_mesh": _global_process_mesh,
+                    "dims_mapping": [-1, 0]
+                })
             auto.shard_tensor(
-                self.v_proj.weight, _global_process_mesh, dim_mapping=[-1, 0])
+                self.v_proj.weight,
+                dist_attr={
+                    "process_mesh": _global_process_mesh,
+                    "dims_mapping": [-1, 0]
+                })
         elif _global_parallel_strategy == "dp_mp":
             auto.shard_tensor(
-                self.q_proj.weight, _global_process_mesh, dim_mapping=[-1, 1])
+                self.q_proj.weight,
+                dist_attr={
+                    "process_mesh": _global_process_mesh,
+                    "dims_mapping": [-1, 1]
+                })
             auto.shard_tensor(
-                self.k_proj.weight, _global_process_mesh, dim_mapping=[-1, 1])
+                self.k_proj.weight,
+                dist_attr={
+                    "process_mesh": _global_process_mesh,
+                    "dims_mapping": [-1, 1]
+                })
             auto.shard_tensor(
-                self.v_proj.weight, _global_process_mesh, dim_mapping=[-1, 1])
+                self.v_proj.weight,
+                dist_attr={
+                    "process_mesh": _global_process_mesh,
+                    "dims_mapping": [-1, 1]
+                })
 
         k = tensor.reshape(x=k, shape=[0, 0, self.num_heads, self.head_dim])
         k = tensor.transpose(x=k, perm=[0, 2, 1, 3])
@@ -951,17 +1043,25 @@ def forward(self, input_ids, position_ids):
 
         if _global_parallel_strategy == "mp":
             auto.shard_tensor(
-                self.out_proj.weight, _global_process_mesh,
-                dim_mapping=[0, -1])
+                self.out_proj.weight,
+                dist_attr={
+                    "process_mesh": _global_process_mesh,
+                    "dims_mapping": [0, -1]
+                })
         elif _global_parallel_strategy == "dp_mp":
             auto.shard_tensor(
-                self.out_proj.weight, _global_process_mesh,
-                dim_mapping=[1, -1])
+                self.out_proj.weight,
+                dist_attr={
+                    "process_mesh": _global_process_mesh,
+                    "dims_mapping": [1, -1]
+                })
         else:
             auto.shard_tensor(
                 self.out_proj.weight,
-                _global_process_mesh,
-                dim_mapping=[-1, -1])
+                dist_attr={
+                    "process_mesh": _global_process_mesh,
+                    "dims_mapping": [-1, -1]
+                })
 
         # Add residual
         residual = embeddings + self.dropout2(out)
@@ -976,14 +1076,30 @@ def forward(self, input_ids, position_ids):
 
         if _global_parallel_strategy == "mp":
             auto.shard_tensor(
-                self.linear0.weight, _global_process_mesh, dim_mapping=[-1, 0])
+                self.linear0.weight,
+                dist_attr={
+                    "process_mesh": _global_process_mesh,
+                    "dims_mapping": [-1, 0]
+                })
             auto.shard_tensor(
-                self.linear1.weight, _global_process_mesh, dim_mapping=[0, -1])
+                self.linear1.weight,
+                dist_attr={
+                    "process_mesh": _global_process_mesh,
+                    "dims_mapping": [0, -1]
+                })
         elif _global_parallel_strategy == "dp_mp":
             auto.shard_tensor(
-                self.linear0.weight, _global_process_mesh, dim_mapping=[-1, 1])
+                self.linear0.weight,
+                dist_attr={
+                    "process_mesh": _global_process_mesh,
+                    "dims_mapping": [-1, 1]
+                })
             auto.shard_tensor(
-                self.linear1.weight, _global_process_mesh, dim_mapping=[1, -1])
+                self.linear1.weight,
+                dist_attr={
+                    "process_mesh": _global_process_mesh,
+                    "dims_mapping": [1, -1]
+                })
 
         # Add residual
         final = residual + self.dropout3(out3)
@@ -1022,7 +1138,7 @@ def test_decoder_dp_mp(self):
         _global_parallel_strategy = "dp_mp"
         global _global_process_mesh
         _global_process_mesh = auto.ProcessMesh(
-            mesh=[[0, 1, 2, 3], [4, 5, 6, 7]], parent=ROOT_MESH)
+            mesh=[[0, 1, 2, 3], [4, 5, 6, 7]])
         serial_main_prog, serial_startup_prog, dist_main_prog, dist_startup_prog, dist_context = get_programs(
             decoder_pretrain_forward)
 
@@ -1105,7 +1221,7 @@ def test_decoder_noparallel(self):
         _global_parallel_strategy = "None"
         global _global_process_mesh
         _global_process_mesh = auto.ProcessMesh(
-            mesh=[[0, 1, 2, 3], [4, 5, 6, 7]], parent=ROOT_MESH)
+            mesh=[[0, 1, 2, 3], [4, 5, 6, 7]])
         serial_main_prog, serial_startup_prog, dist_main_prog, dist_startup_prog, dist_context = get_programs(
             decoder_pretrain_forward)
 
diff --git a/python/paddle/fluid/tests/unittests/test_auto_parallel_partitioner_gpt.py b/python/paddle/fluid/tests/unittests/test_auto_parallel_partitioner_gpt.py
index 11b3338bc675c..7fcb18db12817 100755
--- a/python/paddle/fluid/tests/unittests/test_auto_parallel_partitioner_gpt.py
+++ b/python/paddle/fluid/tests/unittests/test_auto_parallel_partitioner_gpt.py
@@ -32,14 +32,13 @@
 import paddle.static as static
 import paddle.distributed.auto_parallel as auto
 from paddle.distributed.auto_parallel.utils import check_distributed_attr_for_program
-from paddle.distributed.auto_parallel.utils import print_program_with_distributed_attr
-from paddle.distributed.auto_parallel.context import DistributedContext
+from paddle.distributed.auto_parallel.utils import print_program_with_dist_attr
+from paddle.distributed.auto_parallel.dist_context import DistributedContext
 from paddle.distributed.auto_parallel.partitioner import Partitioner
 from paddle.distributed.auto_parallel.utils import _get_comm_group
-from paddle.distributed.auto_parallel.process import new_process_group
+from paddle.distributed.auto_parallel.process_group import new_process_group
 
 paddle.enable_static()
-ROOT_MESH = auto.ProcessMesh([[0, 1, 2, 3], [4, 5, 6, 7]])
 _global_parallel_strategy = None
 _global_process_mesh = None
 
@@ -55,6 +54,38 @@ def check_tensor_split(prog1, varnames1, prog2, varnames2, axis, nsplit):
     return True
 
 
+def is_valid_completed_program(dist_context, program):
+
+    # TODO (ZJ-LIANG) should check all block
+    ops = program.global_block().ops
+    vars_ = program.list_vars()
+    for op in ops:
+        op_dist_attrs = dist_context.get_op_dist_attr_for_program(op)
+        if op_dist_attrs == None:
+            return False
+
+        if op_dist_attrs.process_mesh == None:
+            return False
+
+        for tensor_dist_attr in op_dist_attrs.inputs_dist_attrs.values():
+            if None == tensor_dist_attr.dims_mapping:
+                return False
+        for tensor_dist_attr in op_dist_attrs.outputs_dist_attrs.values():
+            if None == tensor_dist_attr.dims_mapping:
+                return False
+
+    for var in vars_:
+        var_dist_attrs = dist_context.get_tensor_dist_attr_for_program(var)
+        if var_dist_attrs == None:
+            return False
+        elif var_dist_attrs.process_mesh == None:
+            return False
+        elif var_dist_attrs.dims_mapping == None:
+            return False
+
+    return True
+
+
 class MultiHeadAttention(nn.Layer):
     """
     Attention mapps queries and a set of key-value pairs to outputs, and
@@ -122,10 +153,18 @@ def _prepare_qkv(self, query, key, value, use_cache=False, cache=None):
 
         if _global_parallel_strategy == "mp":
             auto.shard_tensor(
-                self.q_proj.weight, _global_process_mesh, dim_mapping=[-1, 0])
+                self.q_proj.weight,
+                dist_attr={
+                    "process_mesh": _global_process_mesh,
+                    "dims_mapping": [-1, 0]
+                })
         elif _global_parallel_strategy == "dp_mp":
             auto.shard_tensor(
-                self.q_proj.weight, _global_process_mesh, dim_mapping=[-1, 1])
+                self.q_proj.weight,
+                dist_attr={
+                    "process_mesh": _global_process_mesh,
+                    "dims_mapping": [-1, 1]
+                })
 
         q = tensor.reshape(x=q, shape=[0, 0, self.num_heads, self.head_dim])
         q = tensor.transpose(x=q, perm=[0, 2, 1, 3])
@@ -159,19 +198,35 @@ def compute_kv(self, key, value):
 
         if _global_parallel_strategy == "mp":
             auto.shard_tensor(
-                self.k_proj.weight, _global_process_mesh, dim_mapping=[-1, 0])
+                self.k_proj.weight,
+                dist_attr={
+                    "process_mesh": _global_process_mesh,
+                    "dims_mapping": [-1, 0]
+                })
         elif _global_parallel_strategy == "dp_mp":
             auto.shard_tensor(
-                self.k_proj.weight, _global_process_mesh, dim_mapping=[-1, 1])
+                self.k_proj.weight,
+                dist_attr={
+                    "process_mesh": _global_process_mesh,
+                    "dims_mapping": [-1, 1]
+                })
 
         v = self.v_proj(value)
 
         if _global_parallel_strategy == "mp":
             auto.shard_tensor(
-                self.v_proj.weight, _global_process_mesh, dim_mapping=[-1, 0])
+                self.v_proj.weight,
+                dist_attr={
+                    "process_mesh": _global_process_mesh,
+                    "dims_mapping": [-1, 0]
+                })
         elif _global_parallel_strategy == "dp_mp":
             auto.shard_tensor(
-                self.v_proj.weight, _global_process_mesh, dim_mapping=[-1, 1])
+                self.v_proj.weight,
+                dist_attr={
+                    "process_mesh": _global_process_mesh,
+                    "dims_mapping": [-1, 1]
+                })
 
         k = tensor.reshape(x=k, shape=[0, 0, self.num_heads, self.head_dim])
         k = tensor.transpose(x=k, perm=[0, 2, 1, 3])
@@ -252,12 +307,18 @@ def forward(self,
 
         if _global_parallel_strategy == "mp":
             auto.shard_tensor(
-                self.out_proj.weight, _global_process_mesh,
-                dim_mapping=[0, -1])
+                self.out_proj.weight,
+                dist_attr={
+                    "process_mesh": _global_process_mesh,
+                    "dims_mapping": [0, -1]
+                })
         elif _global_parallel_strategy == "dp_mp":
             auto.shard_tensor(
-                self.out_proj.weight, _global_process_mesh,
-                dim_mapping=[1, -1])
+                self.out_proj.weight,
+                dist_attr={
+                    "process_mesh": _global_process_mesh,
+                    "dims_mapping": [1, -1]
+                })
 
         outs = [out]
         if self.need_weights:
@@ -425,17 +486,33 @@ def forward(self, tgt, memory, tgt_mask=None, use_cache=False, cache=None):
 
         if _global_parallel_strategy == "mp":
             auto.shard_tensor(
-                self.linear1.weight, _global_process_mesh, dim_mapping=[-1, 0])
+                self.linear1.weight,
+                dist_attr={
+                    "process_mesh": _global_process_mesh,
+                    "dims_mapping": [-1, 0]
+                })
         elif _global_parallel_strategy == "dp_mp":
             auto.shard_tensor(
-                self.linear1.weight, _global_process_mesh, dim_mapping=[-1, 1])
+                self.linear1.weight,
+                dist_attr={
+                    "process_mesh": _global_process_mesh,
+                    "dims_mapping": [-1, 1]
+                })
 
         if _global_parallel_strategy == "mp":
             auto.shard_tensor(
-                self.linear2.weight, _global_process_mesh, dim_mapping=[0, -1])
+                self.linear2.weight,
+                dist_attr={
+                    "process_mesh": _global_process_mesh,
+                    "dims_mapping": [0, -1]
+                })
         elif _global_parallel_strategy == "dp_mp":
             auto.shard_tensor(
-                self.linear2.weight, _global_process_mesh, dim_mapping=[1, -1])
+                self.linear2.weight,
+                dist_attr={
+                    "process_mesh": _global_process_mesh,
+                    "dims_mapping": [1, -1]
+                })
 
         # tgt = self.dropout2(
         #     self.linear2(F.gelu(
@@ -499,13 +576,17 @@ def forward(self, input_ids, position_ids=None):
         if _global_parallel_strategy == "mp":
             auto.shard_tensor(
                 self.word_embeddings.weight,
-                _global_process_mesh,
-                dim_mapping=[0, -1])
+                dist_attr={
+                    "process_mesh": _global_process_mesh,
+                    "dims_mapping": [0, -1]
+                })
         elif _global_parallel_strategy == "dp_mp":
             auto.shard_tensor(
                 self.word_embeddings.weight,
-                _global_process_mesh,
-                dim_mapping=[1, -1])
+                dist_attr={
+                    "process_mesh": _global_process_mesh,
+                    "dims_mapping": [1, -1]
+                })
 
         position_embeddings = self.position_embeddings(position_ids)
         embeddings = input_embedings + position_embeddings
@@ -731,10 +812,18 @@ def gpt_pretrain_forward(train_program, start_program):
 
         if _global_parallel_strategy == "dp":
             auto.shard_tensor(
-                input_ids, _global_process_mesh, dim_mapping=[0, -1])
+                input_ids,
+                dist_attr={
+                    "process_mesh": _global_process_mesh,
+                    "dims_mapping": [0, -1]
+                })
         elif _global_parallel_strategy == "dp_mp":
             auto.shard_tensor(
-                input_ids, _global_process_mesh, dim_mapping=[0, -1])
+                input_ids,
+                dist_attr={
+                    "process_mesh": _global_process_mesh,
+                    "dims_mapping": [0, -1]
+                })
 
         gpt = GPTModel(
             vocab_size=32768,
@@ -769,12 +858,12 @@ def test_gpt_dp_mp(self):
         global _global_process_mesh
 
         _global_process_mesh = auto.ProcessMesh(
-            mesh=[[0, 1, 2, 3], [4, 5, 6, 7]], parent=ROOT_MESH)
+            mesh=[[0, 1, 2, 3], [4, 5, 6, 7]])
 
         train_program = static.Program()
         start_program = static.Program()
         dist_context = DistributedContext()
-        dist_context.set_process_mesh(_global_process_mesh)
+        dist_context.process_mesh = _global_process_mesh
         train_program, start_program, loss = gpt_pretrain_forward(train_program,
                                                                   start_program)
         complete_train_program = auto.complete_annotation(train_program,
@@ -804,7 +893,7 @@ def test_gpt_dp_mp(self):
         opt_ops = partitioner.apply_optimize(optimizer, dist_params_grads,
                                              auto_parallel_main_prog,
                                              auto_parallel_startup_prog)
-        from paddle.distributed.auto_parallel.context import set_default_distributed_context
+        from paddle.distributed.auto_parallel.dist_context import set_default_distributed_context
         set_default_distributed_context(dist_context)
         with open("./test_auto_parallel_partitioner_main_new.txt1", "w") as fw:
             fw.write(str(auto_parallel_main_prog))
@@ -848,14 +937,12 @@ def test_gpt_dp_mp(self):
         mp_parallel_axis = 1
         dp_parallel_axis = 0
 
-        group_ranks = _get_comm_group(process_mesh.process_group,
-                                      process_mesh.topology, mp_parallel_axis,
-                                      3)
+        group_ranks = _get_comm_group(
+            process_mesh.processes, process_mesh.topology, mp_parallel_axis, 3)
         mp_ring_id = new_process_group(group_ranks).id
 
-        group_ranks = _get_comm_group(process_mesh.process_group,
-                                      process_mesh.topology, dp_parallel_axis,
-                                      3)
+        group_ranks = _get_comm_group(
+            process_mesh.processes, process_mesh.topology, dp_parallel_axis, 3)
         dp_ring_id = new_process_group(group_ranks).id
 
         tensor_parallel_allreduce_vars = sorted([
@@ -874,6 +961,9 @@ def test_gpt_dp_mp(self):
         self.assertTrue(all_params == data_parallel_allreduce_vars)
         self.assertTrue(allreduce_grads == tensor_parallel_allreduce_vars)
 
+        self.assertTrue(
+            is_valid_completed_program(dist_context, auto_parallel_main_prog))
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard.py b/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard.py
index fe9b965ed8733..0439b9a287cf6 100644
--- a/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard.py
+++ b/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard.py
@@ -22,16 +22,16 @@
 import paddle.nn.functional as F
 import paddle.utils as utils
 import paddle.distributed.auto_parallel as auto
-from paddle.distributed.auto_parallel.context import DistributedContext
+from paddle.distributed.auto_parallel.dist_context import DistributedContext
 from paddle.distributed import fleet
 from paddle.distributed.auto_parallel.partitioner import Partitioner
 from paddle.distributed.auto_parallel.reshard import reshard
-from paddle.distributed.auto_parallel.process import PROCESS_GROUP_MAP
+from paddle.distributed.auto_parallel.process_group import _g_process_group_map
+from paddle.distributed.auto_parallel.utils import print_program_with_dist_attr
 
 paddle.enable_static()
 _global_parallel_strategy = None
 _global_process_mesh = None
-ROOT_MESH = auto.ProcessMesh([0, 1])
 PP_MESH_0 = None
 PP_MESH_1 = None
 
@@ -57,16 +57,30 @@ def __init__(self,
     def forward(self, input):
         if _global_parallel_strategy == "pp":
             auto.shard_tensor(
-                self.linear0.weight, PP_MESH_0, dim_mapping=[-1, -1])
+                self.linear0.weight,
+                dist_attr={
+                    "process_mesh": PP_MESH_0,
+                    "dims_mapping": [-1, -1]
+                })
             auto.shard_tensor(
-                self.linear1.weight, PP_MESH_1, dim_mapping=[-1, -1])
+                self.linear1.weight,
+                dist_attr={
+                    "process_mesh": PP_MESH_1,
+                    "dims_mapping": [-1, -1]
+                })
         else:
             auto.shard_tensor(
-                self.linear0.weight, _global_process_mesh,
-                dim_mapping=[-1, -1])
+                self.linear0.weight,
+                dist_attr={
+                    "process_mesh": _global_process_mesh,
+                    "dims_mapping": [-1, -1]
+                })
             auto.shard_tensor(
-                self.linear1.weight, _global_process_mesh,
-                dim_mapping=[-1, -1])
+                self.linear1.weight,
+                dist_attr={
+                    "process_mesh": _global_process_mesh,
+                    "dims_mapping": [-1, -1]
+                })
 
         out = self.norm(input)
         out = self.linear0(out)
@@ -88,12 +102,32 @@ def mlp_forward(train_program, start_program):
             name="label", shape=[batch_size, 1], dtype='float32')
 
         if _global_parallel_strategy == "pp":
-            auto.shard_tensor(input, PP_MESH_0, dim_mapping=[-1, -1])
-            auto.shard_tensor(label, PP_MESH_1, dim_mapping=[-1, -1])
+            auto.shard_tensor(
+                input,
+                dist_attr={
+                    "process_mesh": PP_MESH_0,
+                    "dims_mapping": [-1, -1]
+                })
+            auto.shard_tensor(
+                label,
+                dist_attr={
+                    "process_mesh": PP_MESH_1,
+                    "dims_mapping": [-1, -1]
+                })
         elif _global_parallel_strategy == "dp":
-            auto.shard_tensor(input, _global_process_mesh, dim_mapping=[0, -1])
+            auto.shard_tensor(
+                input,
+                dist_attr={
+                    "process_mesh": _global_process_mesh,
+                    "dims_mapping": [0, -1]
+                })
         else:
-            auto.shard_tensor(input, _global_process_mesh, dim_mapping=[-1, -1])
+            auto.shard_tensor(
+                input,
+                dist_attr={
+                    "process_mesh": _global_process_mesh,
+                    "dims_mapping": [-1, -1]
+                })
 
         mlp = MLPLayer(
             hidden_size=hidden_size,
@@ -108,8 +142,6 @@ def mlp_forward(train_program, start_program):
 
 
 def get_dist_prog(train_program, startup_program, dist_context, rank_id):
-    global _global_process_mesh
-    dist_context.set_process_mesh(_global_process_mesh)
     loss, train_program, startup_program = mlp_forward(train_program,
                                                        startup_program)
 
@@ -136,22 +168,21 @@ def check_backward_dist_attr(dist_context, dist_main_prog, op_need_check):
     has_dist_attr = True
     vars = dist_main_prog.global_block().vars
 
-    op_dist_attr = dist_context.get_op_distributed_attr_for_program(
-        op_need_check)
-    if not op_dist_attr or not op_dist_attr.get_process_mesh():
+    op_dist_attr = dist_context.get_op_dist_attr_for_program(op_need_check)
+    if not op_dist_attr or not op_dist_attr.process_mesh:
         has_dist_attr = False
 
     for var_name in op_need_check.input_arg_names:
         if not op_dist_attr.get_input_dims_mapping(var_name) or \
-        not dist_context.get_tensor_distributed_attr_for_program(vars[var_name]).get_dims_mapping() or \
-        not dist_context.get_tensor_distributed_attr_for_program(vars[var_name]).get_process_mesh():
+        not dist_context.get_tensor_dist_attr_for_program(vars[var_name]).dims_mapping or \
+        not dist_context.get_tensor_dist_attr_for_program(vars[var_name]).process_mesh:
             has_dist_attr = False
             break
 
     if has_dist_attr:
         for var_name in op_need_check.output_arg_names:
-            if not dist_context.get_tensor_distributed_attr_for_program(vars[var_name]).get_dims_mapping() or \
-            not dist_context.get_tensor_distributed_attr_for_program(vars[var_name]).get_process_mesh():
+            if not dist_context.get_tensor_dist_attr_for_program(vars[var_name]).dims_mapping or \
+            not dist_context.get_tensor_dist_attr_for_program(vars[var_name]).process_mesh:
                 has_dist_attr = False
                 break
 
@@ -162,6 +193,7 @@ def check_send_recv_result(dist_main_prog, rank_id):
     send_result = False
     recv_result = False
     ops = dist_main_prog.global_block().ops
+
     if rank_id == 0:
         for idx, op in enumerate(ops):
             if op.type == "send_v2" and "gelu_0.tmp_0" in op.input_arg_names:
@@ -217,7 +249,7 @@ def check_initialization_for_dp(dist_startup_prog):
 class TestMLPReshard(unittest.TestCase):
     def test_complete_backward_annotation(self):
         global _global_process_mesh
-        _global_process_mesh = auto.ProcessMesh(mesh=[0, 1], parent=ROOT_MESH)
+        _global_process_mesh = auto.ProcessMesh(mesh=[0, 1])
 
         train_program = paddle.static.Program()
         startup_program = paddle.static.Program()
@@ -231,6 +263,7 @@ def test_complete_backward_annotation(self):
             if op.type == "gelu_grad":
                 op_need_check = op
                 break
+        # print_program_with_dist_attr(dist_main_prog, dist_context)
 
         # grad op should have dist attr
         self.assertTrue(
@@ -241,11 +274,11 @@ def test_mlp_pp(self):
         global _global_parallel_strategy
         _global_parallel_strategy = "pp"
         global _global_process_mesh
-        _global_process_mesh = auto.ProcessMesh(mesh=[0, 1], parent=ROOT_MESH)
+        _global_process_mesh = auto.ProcessMesh(mesh=[0, 1])
         global PP_MESH_0
-        PP_MESH_0 = auto.ProcessMesh(mesh=[0], parent=ROOT_MESH)
+        PP_MESH_0 = auto.ProcessMesh(mesh=[0])
         global PP_MESH_1
-        PP_MESH_1 = auto.ProcessMesh(mesh=[1], parent=ROOT_MESH)
+        PP_MESH_1 = auto.ProcessMesh(mesh=[1])
 
         train_program = paddle.static.Program()
         startup_program = paddle.static.Program()
@@ -253,9 +286,10 @@ def test_mlp_pp(self):
         rank_id = 1
         dist_main_prog, dist_startup_prog = get_dist_prog(
             train_program, startup_program, dist_context, rank_id)
-        for key in list(PROCESS_GROUP_MAP.keys()):
-            del PROCESS_GROUP_MAP[key]
+        for key in list(_g_process_group_map.keys()):
+            del _g_process_group_map[key]
         reshard(dist_main_prog, dist_startup_prog, rank_id, dist_context)
+        # print_program_with_dist_attr(dist_main_prog, dist_context)
 
         # check send and recv result
         self.assertTrue(check_send_recv_result(dist_main_prog, rank_id))
@@ -267,7 +301,7 @@ def test_mlp_dp(self):
         global _global_parallel_strategy
         _global_parallel_strategy = "dp"
         global _global_process_mesh
-        _global_process_mesh = auto.ProcessMesh(mesh=[0, 1], parent=ROOT_MESH)
+        _global_process_mesh = auto.ProcessMesh(mesh=[0, 1])
 
         train_program = paddle.static.Program()
         startup_program = paddle.static.Program()
diff --git a/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_dpmppp.py b/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_dpmppp.py
index babc622393c40..4bd03a3e1bd92 100644
--- a/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_dpmppp.py
+++ b/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_dpmppp.py
@@ -22,18 +22,17 @@
 import paddle.nn.functional as F
 import paddle.utils as utils
 import paddle.distributed.auto_parallel as auto
-from paddle.distributed.auto_parallel.context import DistributedContext
+from paddle.distributed.auto_parallel.dist_context import DistributedContext
 from paddle.distributed import fleet
 from paddle.distributed.auto_parallel.partitioner import Partitioner
 from paddle.distributed.auto_parallel.reshard import reshard
+from paddle.distributed.auto_parallel.utils import print_program_with_dist_attr
 
 paddle.enable_static()
 _global_parallel_strategy = "dp_mp_pp"
-ROOT_MESH = auto.ProcessMesh([[[0, 1], [4, 5]], [[2, 3], [6, 7]]])
-_global_process_mesh = auto.ProcessMesh(
-    [[[0, 1], [4, 5]], [[2, 3], [6, 7]]], parent=ROOT_MESH)
-PP_MESH_0 = auto.ProcessMesh([[0, 1], [4, 5]], parent=ROOT_MESH)
-PP_MESH_1 = auto.ProcessMesh([[2, 3], [6, 7]], parent=ROOT_MESH)
+_global_process_mesh = auto.ProcessMesh([[[0, 1], [4, 5]], [[2, 3], [6, 7]]])
+PP_MESH_0 = auto.ProcessMesh([[0, 1], [4, 5]])
+PP_MESH_1 = auto.ProcessMesh([[2, 3], [6, 7]])
 
 
 class MLPLayer(nn.Layer):
@@ -55,8 +54,14 @@ def __init__(self,
         self.norm = nn.LayerNorm(d_model, epsilon=1e-5)
 
     def forward(self, input):
-        auto.shard_tensor(self.linear0.weight, PP_MESH_0, dim_mapping=[-1, 1])
-        auto.shard_tensor(self.linear1.weight, PP_MESH_1, dim_mapping=[1, -1])
+        auto.shard_tensor(
+            self.linear0.weight,
+            dist_attr={"process_mesh": PP_MESH_0,
+                       "dims_mapping": [-1, 1]})
+        auto.shard_tensor(
+            self.linear1.weight,
+            dist_attr={"process_mesh": PP_MESH_1,
+                       "dims_mapping": [1, -1]})
 
         out = self.norm(input)
         out = self.linear0(out)
@@ -77,8 +82,14 @@ def mlp_forward(train_program, start_program):
         label = static.data(
             name="label", shape=[batch_size, 1], dtype='float32')
 
-        auto.shard_tensor(input, PP_MESH_0, dim_mapping=[0, -1])
-        auto.shard_tensor(label, PP_MESH_1, dim_mapping=[0, -1])
+        auto.shard_tensor(
+            input,
+            dist_attr={"process_mesh": PP_MESH_0,
+                       "dims_mapping": [0, -1]})
+        auto.shard_tensor(
+            label,
+            dist_attr={"process_mesh": PP_MESH_1,
+                       "dims_mapping": [0, -1]})
 
         mlp = MLPLayer(
             hidden_size=hidden_size,
@@ -94,7 +105,7 @@ def mlp_forward(train_program, start_program):
 
 def get_dist_prog(train_program, startup_program, dist_context, rank_id):
     global _global_process_mesh
-    dist_context.set_process_mesh(_global_process_mesh)
+    dist_context.process_mesh = _global_process_mesh
     loss, train_program, startup_program = mlp_forward(train_program,
                                                        startup_program)
 
@@ -156,10 +167,8 @@ def test_mlp_dpmppp(self):
         rank_id = 2
         dist_main_prog, dist_startup_prog = get_dist_prog(
             train_program, startup_program, dist_context, rank_id)
-        print(dist_main_prog)
         reshard(dist_main_prog, dist_startup_prog, rank_id, dist_context)
-        print(dist_main_prog)
-        print(dist_startup_prog)
+        # print_program_with_dist_attr(dist_main_prog, dist_context)
         # check send and recv result
         self.assertTrue(check_send_recv_result(dist_main_prog, rank_id))
 
diff --git a/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_mppp.py b/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_mppp.py
index 96a8b2a8d7cdb..ae79712dc7936 100644
--- a/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_mppp.py
+++ b/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_mppp.py
@@ -22,17 +22,17 @@
 import paddle.nn.functional as F
 import paddle.utils as utils
 import paddle.distributed.auto_parallel as auto
-from paddle.distributed.auto_parallel.context import DistributedContext
+from paddle.distributed.auto_parallel.dist_context import DistributedContext
 from paddle.distributed import fleet
 from paddle.distributed.auto_parallel.partitioner import Partitioner
 from paddle.distributed.auto_parallel.reshard import reshard
+from paddle.distributed.auto_parallel.utils import print_program_with_dist_attr
 
 paddle.enable_static()
 _global_parallel_strategy = "mp_pp"
-ROOT_MESH = auto.ProcessMesh([[0, 1], [2, 3]])
-_global_process_mesh = auto.ProcessMesh([[0, 1], [2, 3]], parent=ROOT_MESH)
-PP_MESH_0 = auto.ProcessMesh([0, 1], parent=ROOT_MESH)
-PP_MESH_1 = auto.ProcessMesh([2, 3], parent=ROOT_MESH)
+_global_process_mesh = auto.ProcessMesh([[0, 1], [2, 3]])
+PP_MESH_0 = auto.ProcessMesh([0, 1])
+PP_MESH_1 = auto.ProcessMesh([2, 3])
 
 
 class MLPLayer(nn.Layer):
@@ -64,10 +64,21 @@ def __init__(self,
 
     def forward(self, input):
         auto.shard_tensor(
-            self.word_embeddings.weight, PP_MESH_0, dim_mapping=[0, -1])
-        auto.shard_tensor(self.linear0.weight, PP_MESH_0, dim_mapping=[-1, 0])
-        auto.shard_tensor(self.linear1.weight, PP_MESH_1, dim_mapping=[0, -1])
-        auto.shard_tensor(self.linear2.weight, PP_MESH_1, dim_mapping=[0, -1])
+            self.word_embeddings.weight,
+            dist_attr={"process_mesh": PP_MESH_0,
+                       "dims_mapping": [0, -1]})
+        auto.shard_tensor(
+            self.linear0.weight,
+            dist_attr={"process_mesh": PP_MESH_0,
+                       "dims_mapping": [-1, 0]})
+        auto.shard_tensor(
+            self.linear1.weight,
+            dist_attr={"process_mesh": PP_MESH_1,
+                       "dims_mapping": [0, -1]})
+        auto.shard_tensor(
+            self.linear2.weight,
+            dist_attr={"process_mesh": PP_MESH_1,
+                       "dims_mapping": [0, -1]})
         w_out = self.word_embeddings(input)
         out = self.linear0(w_out)
         gelu_out = F.gelu(out, approximate=True)
@@ -88,8 +99,13 @@ def mlp_forward(train_program, start_program):
         label = static.data(
             name="label", shape=[batch_size, 1], dtype='float32')
 
-        auto.shard_tensor(input, PP_MESH_0, dim_mapping=[-1])
-        auto.shard_tensor(label, PP_MESH_1, dim_mapping=[-1, -1])
+        auto.shard_tensor(
+            input, dist_attr={"process_mesh": PP_MESH_0,
+                              "dims_mapping": [-1]})
+        auto.shard_tensor(
+            label,
+            dist_attr={"process_mesh": PP_MESH_1,
+                       "dims_mapping": [-1, -1]})
 
         mlp = MLPLayer(
             hidden_size=hidden_size,
@@ -105,7 +121,7 @@ def mlp_forward(train_program, start_program):
 
 def get_dist_prog(train_program, startup_program, dist_context, rank_id):
     global _global_process_mesh
-    dist_context.set_process_mesh(_global_process_mesh)
+    dist_context.process_mesh = _global_process_mesh
     loss, train_program, startup_program = mlp_forward(train_program,
                                                        startup_program)
 
@@ -198,19 +214,41 @@ def test_mlp_mppp(self):
     def test_allgather(self):
         train_program = paddle.static.Program()
         startup_program = paddle.static.Program()
-        process_mesh = auto.ProcessMesh(mesh=[0, 3], parent=ROOT_MESH)
+        process_mesh = auto.ProcessMesh(mesh=[0, 3])
         with static.program_guard(train_program, startup_program):
             x = paddle.static.data(name="x", shape=[4, 4], dtype='float32')
-            x = auto.shard_tensor(x, process_mesh, dim_mapping=[0, -1])
+            x = auto.shard_tensor(
+                x,
+                dist_attr={
+                    "process_mesh": process_mesh,
+                    "dims_mapping": [0, -1]
+                })
 
             w = paddle.static.data(name="w", shape=[4, 4], dtype='float32')
-            w = auto.shard_tensor(w, process_mesh, dim_mapping=[-1, -1])
-
-            y = paddle.distributed.shard_op(paddle.matmul, process_mesh, {
-                x.name: [-1, -1],
-                w.name: [-1, -1]
-            }, **{"x": x,
-                  "y": w})[0]
+            w = auto.shard_tensor(
+                w,
+                dist_attr={
+                    "process_mesh": process_mesh,
+                    "dims_mapping": [-1, -1]
+                })
+
+            # y = paddle.distributed.shard_op(paddle.matmul, process_mesh, {
+            #     x.name: [-1, -1],
+            #     w.name: [-1, -1]
+            # }, **{"x": x,
+            #       "y": w})[0]
+
+            y = paddle.distributed.shard_op(
+                paddle.matmul,
+                dist_attr={
+                    "process_mesh": process_mesh,
+                    x: {
+                        "dims_mapping": [-1, -1]
+                    },
+                    w: {
+                        "dims_mapping": [-1, -1]
+                    }
+                })(x, w)[0]
 
         rank_id = 0
         dist_context = DistributedContext()
diff --git a/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_serial.py b/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_serial.py
index bf2ba9f061fd8..90dd0111dff3d 100644
--- a/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_serial.py
+++ b/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_serial.py
@@ -26,16 +26,15 @@
 import paddle.nn.functional as F
 import paddle.utils as utils
 import paddle.distributed.auto_parallel as auto
-from paddle.distributed.auto_parallel.context import get_default_distributed_context
+from paddle.distributed.auto_parallel.dist_context import get_default_distributed_context
 from paddle.distributed import fleet
 from paddle.distributed.auto_parallel.partitioner import Partitioner
 from paddle.distributed.auto_parallel.reshard import reshard
-from paddle.distributed.auto_parallel.process import new_process_group
+from paddle.distributed.auto_parallel.process_group import new_process_group
 
 paddle.enable_static()
 _global_parallel_strategy = None
 _global_process_mesh = None
-ROOT_MESH = auto.ProcessMesh([0])
 
 
 class MLPLayer(nn.Layer):
@@ -59,16 +58,30 @@ def __init__(self,
     def forward(self, input):
         if _global_parallel_strategy == "pp":
             auto.shard_tensor(
-                self.linear0.weight, PP_MESH_0, dim_mapping=[-1, -1])
+                self.linear0.weight,
+                dist_attr={
+                    "process_mesh": PP_MESH_0,
+                    "dims_mapping": [-1, -1]
+                })
             auto.shard_tensor(
-                self.linear1.weight, PP_MESH_1, dim_mapping=[-1, -1])
+                self.linear1.weight,
+                dist_attr={
+                    "process_mesh": PP_MESH_1,
+                    "dims_mapping": [-1, -1]
+                })
         else:
             auto.shard_tensor(
-                self.linear0.weight, _global_process_mesh,
-                dim_mapping=[-1, -1])
+                self.linear0.weight,
+                dist_attr={
+                    "process_mesh": _global_process_mesh,
+                    "dims_mapping": [-1, -1]
+                })
             auto.shard_tensor(
-                self.linear1.weight, _global_process_mesh,
-                dim_mapping=[-1, -1])
+                self.linear1.weight,
+                dist_attr={
+                    "process_mesh": _global_process_mesh,
+                    "dims_mapping": [-1, -1]
+                })
 
         out = self.norm(input)
         out = self.linear0(out)
@@ -90,12 +103,32 @@ def mlp_forward(train_program, start_program):
             name="label", shape=[batch_size, 1], dtype='float32')
 
         if _global_parallel_strategy == "pp":
-            auto.shard_tensor(input, PP_MESH_0, dim_mapping=[-1, -1])
-            auto.shard_tensor(label, PP_MESH_1, dim_mapping=[-1, -1])
+            auto.shard_tensor(
+                input,
+                dist_attr={
+                    "process_mesh": PP_MESH_0,
+                    "dims_mapping": [-1, -1]
+                })
+            auto.shard_tensor(
+                label,
+                dist_attr={
+                    "process_mesh": PP_MESH_1,
+                    "dims_mapping": [-1, -1]
+                })
         elif _global_parallel_strategy == "dp":
-            auto.shard_tensor(input, _global_process_mesh, dim_mapping=[0, -1])
+            auto.shard_tensor(
+                input,
+                dist_attr={
+                    "process_mesh": _global_process_mesh,
+                    "dims_mapping": [0, -1]
+                })
         else:
-            auto.shard_tensor(input, _global_process_mesh, dim_mapping=[-1, -1])
+            auto.shard_tensor(
+                input,
+                dist_attr={
+                    "process_mesh": _global_process_mesh,
+                    "dims_mapping": [-1, -1]
+                })
 
         mlp = MLPLayer(
             hidden_size=hidden_size,
@@ -168,7 +201,7 @@ def test_mlp_serial(self):
         global _global_parallel_strategy
         _global_parallel_strategy = None
         global _global_process_mesh
-        _global_process_mesh = auto.ProcessMesh(mesh=[0], parent=ROOT_MESH)
+        _global_process_mesh = auto.ProcessMesh(mesh=[0])
 
         train_program = paddle.static.Program()
         startup_program = paddle.static.Program()
diff --git a/python/paddle/fluid/tests/unittests/test_auto_parallel_save_load.py b/python/paddle/fluid/tests/unittests/test_auto_parallel_save_load.py
new file mode 100644
index 0000000000000..b96b51e556772
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_auto_parallel_save_load.py
@@ -0,0 +1,29 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import paddle.fluid as fluid
+
+from test_parallel_dygraph_dataparallel import TestMultipleGpus
+
+
+class TestAutoParallelSaveLoad(TestMultipleGpus):
+    def test_auto_parallel_save_load(self):
+        self.run_mnist_2gpu('auto_parallel_save_load.py')
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_cost_model.py b/python/paddle/fluid/tests/unittests/test_cost_model.py
index 483f665fde7e8..79e2b78792142 100644
--- a/python/paddle/fluid/tests/unittests/test_cost_model.py
+++ b/python/paddle/fluid/tests/unittests/test_cost_model.py
@@ -18,6 +18,7 @@
 
 import paddle
 import paddle.fluid.core as core
+from paddle.cost_model import CostModel
 
 paddle.enable_static()
 
@@ -51,6 +52,41 @@ def test_profiler_measure_program(self):
         self.assertGreaterEqual(cost_data.get_whole_time_ms(),
                                 fc_op_time + mean_op_time)
 
+    def test_static_op_benchmark_cost_model(self):
+        op_name = "abs"
+        cost_model = CostModel()
+        # init static data
+        cost_model.static_cost_data()
+        op_name = "abs"
+        abs_op_cost = cost_model.get_static_op_time(op_name)
+        abs_op_time = abs_op_cost["op_time"]
+        abs_op_config = abs_op_cost["config"]
+        print("abs_op_time:", abs_op_time)
+        print("abs_op_config:", abs_op_config)
+        self.assertGreater(float(abs_op_time), 0)
+        conv2d_op_cost = cost_model.get_static_op_time("conv2d")
+        conv2d_op_time = conv2d_op_cost["op_time"]
+        conv2d_op_config = conv2d_op_cost["config"]
+        self.assertGreater(float(conv2d_op_time), 0)
+        print("conv2d_op_time:", conv2d_op_time)
+        print("conv2d_op_config:", conv2d_op_config)
+
+        conv2d_backward_op_cost = cost_model.get_static_op_time(
+            "conv2d", forward=False)
+        conv2d_backward_op_time = conv2d_backward_op_cost["op_time"]
+        conv2d_backward_op_config = conv2d_backward_op_cost["config"]
+        self.assertGreater(float(conv2d_backward_op_time), 0)
+        print("conv2d_backward_op_time:", conv2d_backward_op_time)
+        print("conv2d_backward_op_config:", conv2d_backward_op_config)
+
+        conv2d_fp16_op_cost = cost_model.get_static_op_time(
+            "conv2d", dtype="float16")
+        conv2d_fp16_op_time = conv2d_fp16_op_cost["op_time"]
+        conv2d_fp16_op_config = conv2d_fp16_op_cost["config"]
+        self.assertGreater(float(conv2d_fp16_op_time), 0)
+        print("conv2d_fp16_op_time:", conv2d_fp16_op_time)
+        print("conv2d_fp16_op_config:", conv2d_fp16_op_config)
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_cuda_cudnn_version.py b/python/paddle/fluid/tests/unittests/test_cuda_cudnn_version.py
new file mode 100644
index 0000000000000..d8229247a817f
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_cuda_cudnn_version.py
@@ -0,0 +1,27 @@
+#   Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import paddle
+
+
+class TestCPUVersion(unittest.TestCase):
+    def test_cuda_cudnn_version_in_cpu_package(self):
+        if not paddle.is_compiled_with_cuda():
+            self.assertEqual(paddle.version.cuda(), 'False')
+            self.assertEqual(paddle.version.cudnn(), 'False')
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_cuda_graph.py b/python/paddle/fluid/tests/unittests/test_cuda_graph.py
index 7d1317473531e..8b4eae8ada4e8 100644
--- a/python/paddle/fluid/tests/unittests/test_cuda_graph.py
+++ b/python/paddle/fluid/tests/unittests/test_cuda_graph.py
@@ -17,15 +17,21 @@
 from paddle.device.cuda.graphs import CUDAGraph
 import unittest
 import numpy as np
+import os
+import pathlib
+import shutil
 from paddle.fluid.dygraph.base import switch_to_static_graph
 from simple_nets import simple_fc_net_with_inputs
 
 
+def can_use_cuda_graph():
+    return paddle.is_compiled_with_cuda() and not paddle.is_compiled_with_rocm()
+
+
 class TestCUDAGraph(unittest.TestCase):
     def setUp(self):
-        if paddle.is_compiled_with_cuda() and not paddle.is_compiled_with_rocm(
-        ):
-            fluid.set_flags({
+        if can_use_cuda_graph():
+            paddle.set_flags({
                 'FLAGS_allocator_strategy': 'auto_growth',
                 'FLAGS_sync_nccl_allreduce': False,
                 'FLAGS_cudnn_deterministic': True
@@ -38,7 +44,7 @@ def random_tensor(self, shape):
 
     @switch_to_static_graph
     def test_cuda_graph_static_graph(self):
-        if not paddle.is_compiled_with_cuda() or paddle.is_compiled_with_rocm():
+        if not can_use_cuda_graph():
             return
 
         seed = 100
@@ -116,7 +122,7 @@ def cuda_graph_static_graph_main(self, seed, use_cuda_graph):
         return np.array(loss_t)
 
     def test_cuda_graph_dynamic_graph(self):
-        if not paddle.is_compiled_with_cuda() or paddle.is_compiled_with_rocm():
+        if not can_use_cuda_graph():
             return
 
         shape = [2, 3]
@@ -142,6 +148,45 @@ def test_cuda_graph_dynamic_graph(self):
 
         g.reset()
 
+    def test_concat_and_split(self):
+        if not can_use_cuda_graph():
+            return
+
+        concat_num = 100
+        xs = []
+        xs_np = []
+
+        for i in range(concat_num):
+            x_np = np.random.random(size=[1]).astype(np.float32)
+            xs.append(paddle.to_tensor(x_np))
+            xs_np.append(x_np)
+
+        graph = CUDAGraph()
+        graph.capture_begin()
+        y = paddle.concat(xs)
+        zs = paddle.split(y, len(xs))
+        graph.capture_end()
+        graph.replay()
+
+        y_np = y.numpy()
+        y_np_expected = np.concatenate(xs_np)
+        self.assertTrue(np.array_equal(y_np, y_np_expected))
+        self.assertEqual(len(zs), len(xs_np))
+        for i, z in enumerate(zs):
+            self.assertTrue(np.array_equal(z.numpy(), xs_np[i]))
+
+        output_dir = 'cuda_graph_dot_{}'.format(os.getpid())
+        try:
+            graph.print_to_dot_files(pathlib.Path(output_dir))
+            graph.reset()
+            shutil.rmtree(output_dir)
+        except Exception as e:
+            msg = str(e)
+            sub_msg = "The print_to_dot_files() method is only supported when CUDA version >= 11.3"
+            self.assertTrue(sub_msg in msg)
+        finally:
+            graph.reset()
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_eigvalsh_op.py b/python/paddle/fluid/tests/unittests/test_eigvalsh_op.py
new file mode 100644
index 0000000000000..db02372267677
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_eigvalsh_op.py
@@ -0,0 +1,192 @@
+#   Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+import paddle
+from op_test import OpTest
+from gradient_checker import grad_check
+
+
+class TestEigvalshOp(OpTest):
+    def setUp(self):
+        paddle.enable_static()
+        self.op_type = "eigvalsh"
+        self.init_input()
+        self.init_config()
+        np.random.seed(123)
+        out_w, out_v = np.linalg.eigh(self.x_np, self.UPLO)
+        self.inputs = {"X": self.x_np}
+        self.attrs = {"UPLO": self.UPLO, "is_test": False}
+        self.outputs = {'Eigenvalues': out_w, 'Eigenvectors': out_v}
+
+    def init_config(self):
+        self.UPLO = 'L'
+
+    def init_input(self):
+        self.x_shape = (10, 10)
+        self.x_type = np.float64
+        self.x_np = np.random.random(self.x_shape).astype(self.x_type)
+
+    def test_check_output(self):
+        # Vectors in posetive or negative is equivalent
+        self.check_output(no_check_set=['Eigenvectors'])
+
+    def test_grad(self):
+        self.check_grad(["X"], ["Eigenvalues"])
+
+
+class TestEigvalshUPLOCase(TestEigvalshOp):
+    def init_config(self):
+        self.UPLO = 'U'
+
+
+class TestEigvalshGPUCase(unittest.TestCase):
+    def setUp(self):
+        self.x_shape = [32, 32]
+        self.dtype = "float32"
+        np.random.seed(123)
+        self.x_np = np.random.random(self.x_shape).astype(self.dtype)
+        self.rtol = 1e-5
+        self.atol = 1e-5
+
+    def test_check_output_gpu(self):
+        if paddle.is_compiled_with_cuda():
+            paddle.disable_static(place=paddle.CUDAPlace(0))
+            input_real_data = paddle.to_tensor(self.x_np)
+            expected_w = np.linalg.eigvalsh(self.x_np)
+            actual_w = paddle.linalg.eigvalsh(input_real_data)
+            np.testing.assert_allclose(
+                actual_w, expected_w, rtol=self.rtol, atol=self.atol)
+
+
+class TestEigvalshAPI(unittest.TestCase):
+    def setUp(self):
+        self.init_input_shape()
+        self.dtype = "float32"
+        self.UPLO = 'L'
+        self.rtol = 1e-6
+        self.atol = 1e-6
+        self.place = paddle.CUDAPlace(0) if paddle.is_compiled_with_cuda() \
+            else paddle.CPUPlace()
+        np.random.seed(123)
+        self.real_data = np.random.random(self.x_shape).astype(self.dtype)
+        self.complex_data = np.random.random(self.x_shape).astype(
+            self.dtype) + 1J * np.random.random(self.x_shape).astype(self.dtype)
+        self.trans_dims = list(range(len(self.x_shape) - 2)) + [
+            len(self.x_shape) - 1, len(self.x_shape) - 2
+        ]
+
+    def init_input_shape(self):
+        self.x_shape = [5, 5]
+
+    def compare_result(self, actual_w, expected_w):
+        np.testing.assert_allclose(
+            actual_w, expected_w, rtol=self.rtol, atol=self.atol)
+
+    def check_static_float_result(self):
+        main_prog = paddle.static.Program()
+        startup_prog = paddle.static.Program()
+        with paddle.static.program_guard(main_prog, startup_prog):
+            input_x = paddle.static.data(
+                'input_x', shape=self.x_shape, dtype=self.dtype)
+            output_w = paddle.linalg.eigvalsh(input_x)
+            exe = paddle.static.Executor(self.place)
+            expected_w = exe.run(main_prog,
+                                 feed={"input_x": self.real_data},
+                                 fetch_list=[output_w])
+
+            actual_w = np.linalg.eigvalsh(self.real_data)
+            self.compare_result(actual_w, expected_w[0])
+
+    def check_static_complex_result(self):
+        main_prog = paddle.static.Program()
+        startup_prog = paddle.static.Program()
+        with paddle.static.program_guard(main_prog, startup_prog):
+            x_dtype = np.complex64 if self.dtype == "float32" else np.complex128
+            input_x = paddle.static.data(
+                'input_x', shape=self.x_shape, dtype=x_dtype)
+            output_w = paddle.linalg.eigvalsh(input_x)
+            exe = paddle.static.Executor(self.place)
+            expected_w = exe.run(main_prog,
+                                 feed={"input_x": self.complex_data},
+                                 fetch_list=[output_w])
+            actual_w = np.linalg.eigvalsh(self.complex_data)
+            self.compare_result(actual_w, expected_w[0])
+
+    def test_in_static_mode(self):
+        paddle.enable_static()
+        self.check_static_float_result()
+        self.check_static_complex_result()
+
+    def test_in_dynamic_mode(self):
+        paddle.disable_static(self.place)
+        input_real_data = paddle.to_tensor(self.real_data)
+        expected_w = np.linalg.eigvalsh(self.real_data)
+        actual_w = paddle.linalg.eigvalsh(input_real_data)
+        self.compare_result(actual_w, expected_w)
+
+        input_complex_data = paddle.to_tensor(self.complex_data)
+        expected_w = np.linalg.eigvalsh(self.complex_data)
+        actual_w = paddle.linalg.eigvalsh(input_complex_data)
+        self.compare_result(actual_w, expected_w)
+
+    def test_eigvalsh_grad(self):
+        paddle.disable_static(self.place)
+        x = paddle.to_tensor(self.complex_data, stop_gradient=False)
+        w = paddle.linalg.eigvalsh(x)
+        (w.sum()).backward()
+        np.testing.assert_allclose(
+            abs(x.grad.numpy()),
+            abs(x.grad.numpy().conj().transpose(self.trans_dims)),
+            rtol=self.rtol,
+            atol=self.atol)
+
+
+class TestEigvalshBatchAPI(TestEigvalshAPI):
+    def init_input_shape(self):
+        self.x_shape = [2, 5, 5]
+
+
+class TestEigvalshAPIError(unittest.TestCase):
+    def test_error(self):
+        main_prog = paddle.static.Program()
+        startup_prog = paddle.static.Program()
+        with paddle.static.program_guard(main_prog, startup_prog):
+            #input maxtrix must greater than 2 dimensions
+            input_x = paddle.static.data(
+                name='x_1', shape=[12], dtype='float32')
+            self.assertRaises(ValueError, paddle.linalg.eigvalsh, input_x)
+
+            #input matrix must be square matrix
+            input_x = paddle.static.data(
+                name='x_2', shape=[12, 32], dtype='float32')
+            self.assertRaises(ValueError, paddle.linalg.eigvalsh, input_x)
+
+            #uplo must be in 'L' or 'U'
+            input_x = paddle.static.data(
+                name='x_3', shape=[4, 4], dtype="float32")
+            uplo = 'R'
+            self.assertRaises(ValueError, paddle.linalg.eigvalsh, input_x, uplo)
+
+            #x_data cannot be integer
+            input_x = paddle.static.data(
+                name='x_4', shape=[4, 4], dtype="int32")
+            self.assertRaises(TypeError, paddle.linalg.eigvalsh, input_x)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_base_2.py b/python/paddle/fluid/tests/unittests/test_fleet_base_2.py
index 7ca08bcb9d7f9..64b8744472d39 100644
--- a/python/paddle/fluid/tests/unittests/test_fleet_base_2.py
+++ b/python/paddle/fluid/tests/unittests/test_fleet_base_2.py
@@ -36,8 +36,13 @@ def test_ps_minimize(self):
 
         input_x = paddle.fluid.layers.data(
             name="x", shape=[32], dtype='float32')
+        input_slot = paddle.fluid.layers.data(
+            name="slot", shape=[1], dtype='int64')
         input_y = paddle.fluid.layers.data(name="y", shape=[1], dtype='int64')
 
+        emb = paddle.fluid.layers.embedding(
+            input=input_slot, size=[10, 9], is_sparse=True)
+        input_x = paddle.concat(x=[input_x, emb], axis=1)
         fc_1 = paddle.fluid.layers.fc(input=input_x, size=64, act='tanh')
         fc_2 = paddle.fluid.layers.fc(input=fc_1, size=64, act='tanh')
         prediction = paddle.fluid.layers.fc(input=[fc_2], size=2, act='softmax')
@@ -63,11 +68,14 @@ def test_ps_minimize(self):
         compiled_prog = fluid.compiler.CompiledProgram(
             fluid.default_main_program())
 
+        fleet.init_worker()
         fleet.fleet.save(dirname="/tmp", feed=['x', 'y'], fetch=[avg_cost])
         fleet.fleet.save(
             dirname="/tmp", feed=[input_x, input_y], fetch=[avg_cost])
         fleet.fleet.save(dirname="/tmp")
 
+        fleet.load_model(path="/tmp", mode=0)
+
         self.assertRaises(
             Exception,
             fleet.save_inference_model,
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_executor.py b/python/paddle/fluid/tests/unittests/test_fleet_executor.py
new file mode 100644
index 0000000000000..1d042547e2067
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_fleet_executor.py
@@ -0,0 +1,43 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import paddle
+import paddle.fluid as fluid
+
+paddle.enable_static()
+
+
+class TestFleetExecutor(unittest.TestCase):
+    def run_fleet_executor(self, place):
+        exe = paddle.static.Executor(place)
+        empty_program = paddle.static.Program()
+        with fluid.program_guard(empty_program, empty_program):
+            x = fluid.layers.data(name='x', shape=[1], dtype=paddle.float32)
+        empty_program._pipeline_opt = {
+            "fleet_opt": True,
+            "section_program": empty_program
+        }
+        exe.run(empty_program, feed={'x': [1]})
+
+    def test_executor_on_multi_devices(self):
+        places = [fluid.CPUPlace()]
+        if fluid.is_compiled_with_cuda():
+            places.append(fluid.CUDAPlace(0))
+        for place in places:
+            self.run_fleet_executor(place)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_launch_rank_mapping.sh b/python/paddle/fluid/tests/unittests/test_fleet_launch_rank_mapping.sh
new file mode 100755
index 0000000000000..eb84f9f6e847a
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_fleet_launch_rank_mapping.sh
@@ -0,0 +1,64 @@
+#!/bin/bash
+
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -e
+
+# use single node
+echo "begin test"
+
+RANK_MAPPING_FILE_NAME="rank_mapping_file.json"
+cat > ${RANK_MAPPING_FILE_NAME} <<EOF
+{
+    "ip_ranks": [
+        {
+            "ip": "127.0.0.1",
+            "ranks": [0,1]
+        }
+    ]
+}
+EOF
+
+export FLAGS_START_PORT=35789
+distributed_args="--rank_mapping_file ${RANK_MAPPING_FILE_NAME} --enable_auto_mapping true --log_dir=testlog"
+CUDA_VISIBLE_DEVICES=0,1 python -m paddle.distributed.fleet.launch ${distributed_args} multi_process.py fleetlaunchcloud_rank_mapping
+
+str1="selected_gpus:0 worker_endpoints:127.0.0.1:35789,127.0.0.1:35790 trainers_num:2 current_endpoint:127.0.0.1:35789 trainer_id:0"
+str2="selected_gpus:1 worker_endpoints:127.0.0.1:35789,127.0.0.1:35790 trainers_num:2 current_endpoint:127.0.0.1:35790 trainer_id:1"
+file_0="multi_process_fleetlaunchcloud_rank_mapping.check_0.log"
+file_1="multi_process_fleetlaunchcloud_rank_mapping.check_1.log"
+
+echo "paddlecloud params test"
+if grep -q "$str1" "$file_0"; then
+    echo "find trainer 0"
+else
+    echo "not find trainer 0"
+    exit -1
+fi
+
+if grep -q "$str2" "$file_1"; then
+    echo "find trainer 1"
+else
+    echo "not find trainer 1"
+    exit -1
+fi
+
+# test async poll process
+if [ -f $file_0 ]; then
+    rm $file_0
+fi
+if [ -f $file_1 ]; then
+    rm $file_1
+fi
diff --git a/python/paddle/fluid/tests/unittests/test_fused_attention_op.py b/python/paddle/fluid/tests/unittests/test_fused_attention_op.py
index 7359adff62021..c0b3e27e6719d 100644
--- a/python/paddle/fluid/tests/unittests/test_fused_attention_op.py
+++ b/python/paddle/fluid/tests/unittests/test_fused_attention_op.py
@@ -65,7 +65,8 @@ def setUp(self):
     def config(self):
         self.x_type = np.float32
         self.attn_mask_type = np.float64
-        self.pre_layer_norm = True
+        self.pre_layer_norm = False
+        self.has_attn_mask = True
         self.training = True
 
         self.batch_size = 8
@@ -84,16 +85,20 @@ def config(self):
     def generate_input_data(self):
         self.query = np.random.rand(self.batch_size, self.query_length,
                                     self.embed_dim).astype(self.x_type)
-        self.attn_mask = np.ones(
-            (self.batch_size, self.num_heads, self.query_length,
-             self.key_length),
-            dtype=self.attn_mask_type)
-        if self.attn_mask_type == np.int64:
-            self.attn_mask = np.tril(self.attn_mask)
-        elif self.attn_mask_type == np.float64:
-            self.attn_mask = (np.tril(self.attn_mask) - 1.0) * 1e9
+        if self.has_attn_mask:
+            self.attn_mask = np.ones(
+                (self.batch_size, self.num_heads, self.query_length,
+                 self.key_length),
+                dtype=self.attn_mask_type)
+            if self.attn_mask_type == np.int64:
+                self.attn_mask = np.tril(self.attn_mask)
+            elif self.attn_mask_type == np.float64:
+                self.attn_mask = (np.tril(self.attn_mask) - 1.0) * 1e9
+            else:
+                raise ValueError(
+                    "'attn_mask_type' should be 'int64' or 'float64'.")
         else:
-            raise ValueError("'attn_mask_type' should be 'int64' or 'float64'.")
+            self.attn_mask = None
         self.key, self.value = self.query, self.query
 
         self.dout = np.random.random((self.batch_size, self.query_length,
@@ -102,7 +107,10 @@ def generate_input_data(self):
     def GetBaselineOut(self):
         paddle.disable_static(place=paddle.CUDAPlace(0))
         tensor_query = paddle.to_tensor(self.query, stop_gradient=False)
-        attn_mask = paddle.to_tensor(self.attn_mask, stop_gradient=False)
+        if self.has_attn_mask:
+            attn_mask = paddle.to_tensor(self.attn_mask, stop_gradient=False)
+        else:
+            attn_mask = None
         residual = tensor_query
 
         ln1_out = tensor_query
@@ -147,8 +155,8 @@ def GetBaselineOut(self):
         residual_out = residual + self.dropout(out)
         if not self.pre_layer_norm:
             final_out = self.norm1(residual_out)
-        if self.pre_layer_norm:
-            final_out = self.norm2(residual_out)
+        else:
+            final_out = residual_out
         paddle.autograd.backward(
             [final_out], [paddle.to_tensor(self.dout)], retain_graph=True)
         return final_out, tensor_query.grad
@@ -187,7 +195,10 @@ def GetFusedAttentionOut(self):
         qkv_bias = qkv_bias.reshape((3, self.num_heads, self.head_dim))
 
         x = paddle.to_tensor(self.query, stop_gradient=False)
-        attn_mask = paddle.to_tensor(self.attn_mask, stop_gradient=False)
+        if self.has_attn_mask:
+            attn_mask = paddle.to_tensor(self.attn_mask, stop_gradient=False)
+        else:
+            attn_mask = None
         qkv_weight_tensor = paddle.to_tensor(qkv_weight, stop_gradient=False)
         qkv_bias_tensor = paddle.to_tensor(qkv_bias, stop_gradient=False)
         epsilon = 1e-05
@@ -208,16 +219,77 @@ def test_fused_attention_op(self):
         final_out_ref, x_grad_ref = self.GetBaselineOut()
         final_out, x_grad = self.GetFusedAttentionOut()
         np.testing.assert_allclose(
-            final_out_ref, final_out.numpy(), rtol=1e-5, atol=1e-5)
+            final_out_ref, final_out.numpy(), rtol=1e-5, atol=1e-4)
+        np.testing.assert_allclose(
+            x_grad_ref, x_grad.numpy(), rtol=1e-5, atol=1e-4)
+
+
+class TestFusedAttentionOpPreLn(TestFusedAttentionOp):
+    def config(self):
+        self.x_type = np.float32
+        self.attn_mask_type = np.float64
+        self.pre_layer_norm = True
+        self.has_attn_mask = True
+        self.training = True
+
+        self.batch_size = 8
+        self.query_length = 128
+        self.head_dim = 64
+        self.num_heads = 16
+        self.embed_dim = self.head_dim * self.num_heads
+
+        self.dropout_prob = 0.0
+        self.attn_dropout_prob = 0.0
+        self.weight_attr = None
+        self.bias_attr = None
+        self.kdim, self.vdim = self.embed_dim, self.embed_dim
+        self.key_length, self.value_length = self.query_length, self.query_length
+
+    def test_fused_attention_op(self):
+        final_out_ref, x_grad_ref = self.GetBaselineOut()
+        final_out, x_grad = self.GetFusedAttentionOut()
+        np.testing.assert_allclose(
+            final_out_ref, final_out.numpy(), rtol=1e-5, atol=1e-4)
         np.testing.assert_allclose(
-            x_grad_ref, x_grad.numpy(), rtol=1e-5, atol=1e-5)
+            x_grad_ref, x_grad.numpy(), rtol=1e-5, atol=1e-4)
+
+
+class TestFusedAttentionOpNoneAttnMask(TestFusedAttentionOp):
+    def config(self):
+        self.x_type = np.float32
+        self.attn_mask_type = np.float64
+        self.pre_layer_norm = True
+        self.has_attn_mask = False
+        self.training = True
+
+        self.batch_size = 8
+        self.query_length = 128
+        self.head_dim = 64
+        self.num_heads = 16
+        self.embed_dim = self.head_dim * self.num_heads
+
+        self.dropout_prob = 0.0
+        self.attn_dropout_prob = 0.0
+        self.weight_attr = None
+        self.bias_attr = None
+        self.kdim, self.vdim = self.embed_dim, self.embed_dim
+        self.key_length, self.value_length = self.query_length, self.query_length
+
+    def test_fused_attention_op(self):
+        final_out_ref, x_grad_ref = self.GetBaselineOut()
+        final_out, x_grad = self.GetFusedAttentionOut()
+        np.testing.assert_allclose(
+            final_out_ref, final_out.numpy(), rtol=1e-5, atol=1e-4)
+        np.testing.assert_allclose(
+            x_grad_ref, x_grad.numpy(), rtol=1e-5, atol=1e-4)
 
 
 class TestFusedAttentionOpFp16(TestFusedAttentionOp):
     def config(self):
         self.x_type = np.float16
         self.attn_mask_type = np.float64
-        self.pre_layer_norm = True
+        self.pre_layer_norm = False
+        self.has_attn_mask = True
         self.training = True
 
         self.batch_size = 8
diff --git a/python/paddle/fluid/tests/unittests/test_fused_attention_op_api.py b/python/paddle/fluid/tests/unittests/test_fused_attention_op_api.py
index e59ecc19d05cb..92acb5925a1b1 100644
--- a/python/paddle/fluid/tests/unittests/test_fused_attention_op_api.py
+++ b/python/paddle/fluid/tests/unittests/test_fused_attention_op_api.py
@@ -107,7 +107,7 @@ def compute_reference(pre_layer_norm, query, attn_mask, ln_scale, ln_bias,
 
     q = qkv[0:1, ::]
     q = q.reshape(batch_size, num_head, seq_len, head_dim)
-    k = qkv[1:2, ::]  #[1, batch_size, num_head, seq_len, head_dim] 
+    k = qkv[1:2, ::]  #[1, batch_size, num_head, seq_len, head_dim]
     k = k.reshape(batch_size, num_head, seq_len, head_dim)
     v = qkv[2::]
     v = v.reshape(batch_size, num_head, seq_len, head_dim)
@@ -138,9 +138,11 @@ def compute_reference(pre_layer_norm, query, attn_mask, ln_scale, ln_bias,
     out_linear_bias_out = out_linear_out + out_linear_bias
     out_linear_bias_dropout_out = out_linear_bias_out
     out_linear_bias_dropout_residual_out = query + out_linear_bias_dropout_out
-    out_linear_bias_dropout_residual_ln_out = layer_norm(
-        out_linear_bias_dropout_residual_out, True, True, ln_2_scale, ln_2_bias)
-    return out_linear_bias_dropout_residual_ln_out
+    if not pre_layer_norm:
+        out_linear_bias_dropout_residual_out = layer_norm(
+            out_linear_bias_dropout_residual_out, True, True, ln_2_scale,
+            ln_2_bias)
+    return out_linear_bias_dropout_residual_out
 
 
 class TestFusedAttentionAPI(unittest.TestCase):
@@ -152,6 +154,7 @@ def config(self):
         self.x_type = np.float32
         self.attn_mask_type = np.float64
         self.pre_layer_norm = True
+        self.has_attn_mask = True
         self.training = True
         self.need_weight = False
 
@@ -172,19 +175,27 @@ def config(self):
     def generate_input_data(self):
         self.query = np.random.rand(self.batch_size, self.query_length,
                                     self.embed_dim).astype(self.x_type)
-        self.attn_mask = np.ones(
-            (self.batch_size, self.num_heads, self.query_length,
-             self.key_length),
-            dtype=self.attn_mask_type)
-        if self.attn_mask_type == np.int64:
-            self.attn_mask = np.tril(self.attn_mask)
-        elif self.attn_mask_type == np.float64:
-            self.attn_mask = (np.tril(self.attn_mask) - 1.0) * 1e9
+        if self.has_attn_mask:
+            self.attn_mask = np.ones(
+                (self.batch_size, self.num_heads, self.query_length,
+                 self.key_length),
+                dtype=self.attn_mask_type)
+            if self.attn_mask_type == np.int64:
+                self.attn_mask = np.tril(self.attn_mask)
+            elif self.attn_mask_type == np.float64:
+                self.attn_mask = (np.tril(self.attn_mask) - 1.0) * 1e9
+            else:
+                raise ValueError(
+                    "'attn_mask_type' should be 'int64' or 'float64'.")
         else:
-            raise ValueError("'attn_mask_type' should be 'int64' or 'float64'.")
+            self.attn_mask = None
         self.key, self.value = self.query, self.query
 
     def run_imperative(self):
+        if self.has_attn_mask:
+            attn_mask_tensor = paddle.to_tensor(self.attn_mask)
+        else:
+            attn_mask_tensor = None
         fused_attn = FusedMultiHeadAttention(
             self.embed_dim, self.num_heads, self.dropout_prob,
             self.attn_dropout_prob, self.kdim, self.vdim, self.pre_layer_norm,
@@ -192,7 +203,7 @@ def run_imperative(self):
         out = fused_attn(
             paddle.to_tensor(self.query),
             paddle.to_tensor(self.query),
-            paddle.to_tensor(self.query), paddle.to_tensor(self.attn_mask))
+            paddle.to_tensor(self.query), attn_mask_tensor)
         ref_out = compute_reference(self.pre_layer_norm, self.query,
                                     self.attn_mask,
                                     fused_attn.pre_ln_scale.numpy(),
@@ -203,7 +214,7 @@ def run_imperative(self):
                                     fused_attn.qkv_bias.numpy(),
                                     fused_attn.linear_weight.numpy(),
                                     fused_attn.linear_bias.numpy())
-        self.assertTrue(np.allclose(ref_out, out, rtol=1e-5, atol=1e-5))
+        np.testing.assert_allclose(ref_out, out.numpy(), rtol=1e-5, atol=1e-5)
 
     def run_static(self):
         fused_attn = FusedMultiHeadAttention(
@@ -215,29 +226,42 @@ def run_static(self):
             name='X',
             shape=[self.batch_size, self.query_length, self.embed_dim],
             dtype=self.x_type)
-        attn_mask = paddle.static.data(
-            name='SrcMask',
-            shape=[
-                self.batch_size, self.num_heads, self.query_length,
-                self.key_length
-            ],
-            dtype=self.attn_mask_type)
-        final_out = fused_attn(x, x, x, attn_mask)
+        if self.has_attn_mask:
+            attn_mask = paddle.static.data(
+                name='SrcMask',
+                shape=[
+                    self.batch_size, self.num_heads, self.query_length,
+                    self.key_length
+                ],
+                dtype=self.attn_mask_type)
+            final_out = fused_attn(x, x, x, attn_mask)
+        else:
+            final_out = fused_attn(x, x, x)
 
         place = paddle.CUDAPlace(0)
         exe = paddle.static.Executor(place)
         exe.run(paddle.static.default_startup_program())
-        out, qkv_weight, qkv_bias, out_linear_weight, linear_bias, ln_scale, ln_bias, ln_2_scale, ln_2_bias = exe.run(
-            paddle.static.default_main_program(),
-            feed={"X": self.query,
-                  "SrcMask": self.attn_mask},
-            fetch_list=[
-                final_out, fused_attn.qkv_weight, fused_attn.qkv_bias,
-                fused_attn.linear_weight, fused_attn.linear_bias,
-                fused_attn.pre_ln_scale, fused_attn.pre_ln_bias,
-                fused_attn.ln_scale, fused_attn.ln_bias
-            ])
-
+        if self.has_attn_mask:
+            out, qkv_weight, qkv_bias, out_linear_weight, linear_bias, ln_scale, ln_bias, ln_2_scale, ln_2_bias = exe.run(
+                paddle.static.default_main_program(),
+                feed={"X": self.query,
+                      "SrcMask": self.attn_mask},
+                fetch_list=[
+                    final_out, fused_attn.qkv_weight, fused_attn.qkv_bias,
+                    fused_attn.linear_weight, fused_attn.linear_bias,
+                    fused_attn.pre_ln_scale, fused_attn.pre_ln_bias,
+                    fused_attn.ln_scale, fused_attn.ln_bias
+                ])
+        else:
+            out, qkv_weight, qkv_bias, out_linear_weight, linear_bias, ln_scale, ln_bias, ln_2_scale, ln_2_bias = exe.run(
+                paddle.static.default_main_program(),
+                feed={"X": self.query, },
+                fetch_list=[
+                    final_out, fused_attn.qkv_weight, fused_attn.qkv_bias,
+                    fused_attn.linear_weight, fused_attn.linear_bias,
+                    fused_attn.pre_ln_scale, fused_attn.pre_ln_bias,
+                    fused_attn.ln_scale, fused_attn.ln_bias
+                ])
         return out, qkv_weight, qkv_bias, out_linear_weight, linear_bias, ln_scale, ln_bias, ln_2_scale, ln_2_bias
 
     def test_static_api(self):
@@ -249,14 +273,36 @@ def test_static_api(self):
                                     self.attn_mask, ln_scale, ln_bias,
                                     ln_2_scale, ln_2_bias, qkv_weight, qkv_bias,
                                     linear_weight, linear_bias)
-        self.assertTrue(
-            np.allclose(
-                np.array(ref_out), np.array(out), rtol=1e-5, atol=1e-5))
+        np.testing.assert_allclose(ref_out, out, rtol=1e-5, atol=1e-5)
 
     def test_dynamic_api(self):
         paddle.disable_static(place=paddle.CUDAPlace(0))
         self.run_imperative()
 
 
+class TestFusedAttentionAPINoneAttnMask(TestFusedAttentionAPI):
+    def config(self):
+        self.x_type = np.float32
+        self.attn_mask_type = np.float64
+        self.pre_layer_norm = True
+        self.has_attn_mask = False
+        self.training = True
+        self.need_weight = False
+
+        self.batch_size = 1
+        self.query_length = 2
+        self.head_dim = 2
+        self.num_heads = 2
+        self.embed_dim = self.head_dim * self.num_heads
+
+        self.dropout_prob = 0.0
+        self.attn_dropout_prob = 0.0
+        self.weight_attr = None
+        self.bias_attr = None
+
+        self.kdim, self.vdim = self.embed_dim, self.embed_dim
+        self.key_length, self.value_length = self.query_length, self.query_length
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_triple_grad.py b/python/paddle/fluid/tests/unittests/test_imperative_triple_grad.py
new file mode 100644
index 0000000000000..d61a08308360f
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_imperative_triple_grad.py
@@ -0,0 +1,150 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle.fluid as fluid
+import paddle
+from paddle.fluid.wrapped_decorator import wrap_decorator
+from paddle.vision.models import resnet50, resnet101
+import unittest
+from unittest import TestCase
+import numpy as np
+
+
+def _dygraph_guard_(func):
+    def __impl__(*args, **kwargs):
+        if fluid.in_dygraph_mode():
+            return func(*args, **kwargs)
+        else:
+            with fluid.dygraph.guard():
+                return func(*args, **kwargs)
+
+    return __impl__
+
+
+dygraph_guard = wrap_decorator(_dygraph_guard_)
+
+
+def random_var(size, low=-1, high=1, dtype='float32'):
+    np.random.seed(2021)
+    x_np = np.random.uniform(low=low, high=high, size=size).astype(dtype)
+    return fluid.dygraph.to_variable(x_np)
+
+
+class TestDygraphTripleGrad(TestCase):
+    def setUp(self):
+        self.sort_sum_gradient = False
+        self.shape = [5, 5]
+
+    def grad(self,
+             outputs,
+             inputs,
+             grad_outputs=None,
+             no_grad_vars=None,
+             retain_graph=None,
+             create_graph=False,
+             allow_unused=False):
+        fluid.set_flags({'FLAGS_sort_sum_gradient': self.sort_sum_gradient})
+        return fluid.dygraph.grad(
+            outputs=outputs,
+            inputs=inputs,
+            grad_outputs=grad_outputs,
+            no_grad_vars=no_grad_vars,
+            retain_graph=retain_graph,
+            create_graph=create_graph,
+            allow_unused=allow_unused)
+
+    @dygraph_guard
+    def test_exception(self):
+        with self.assertRaises(AssertionError):
+            self.grad(None, None)
+
+        shape = self.shape
+
+        with self.assertRaises(AssertionError):
+            self.grad(1, random_var(shape))
+
+        with self.assertRaises(AssertionError):
+            self.grad(random_var(shape), 1)
+
+        with self.assertRaises(AssertionError):
+            self.grad([1], [random_var(shape)])
+
+        with self.assertRaises(AssertionError):
+            self.grad([random_var(shape)], [1])
+
+        with self.assertRaises(AssertionError):
+            self.grad([random_var(shape), random_var(shape)],
+                      [random_var(shape)], [random_var(shape)])
+
+        with self.assertRaises(AssertionError):
+            self.grad(
+                [random_var(shape)], [random_var(shape)], no_grad_vars=[1])
+
+        with self.assertRaises(AssertionError):
+            self.grad([random_var(shape)], [random_var(shape)], no_grad_vars=1)
+
+    @dygraph_guard
+    def test_example_with_gradient_and_create_graph(self):
+        x = random_var(self.shape)
+        x_np = x.numpy()
+        x.stop_gradient = False
+
+        y = random_var(self.shape)
+        y_np = y.numpy()
+        y.stop_gradient = False
+
+        z = random_var(self.shape)
+        z_np = z.numpy()
+        numel = z_np.size
+        z.stop_gradient = False
+
+        out = fluid.layers.sigmoid(paddle.matmul(x, y) + z)
+        out_np = out.numpy()
+
+        dx_actual, = self.grad([out], [x], create_graph=True)
+        # Theoritical result based on math calculation
+        dout = np.ones(self.shape).astype('float32')
+        dx_expected = np.matmul(dout * out_np * (1 - out_np),
+                                np.transpose(y_np))
+        self.assertTrue(np.allclose(dx_actual.numpy(), dx_expected))
+
+        ddx_actual, = self.grad([dx_actual], [x], create_graph=True)
+        # Theoritical result based on math calculation
+        DDY = np.zeros(self.shape).astype('float32')
+        DDX = np.ones(self.shape).astype('float32')
+        double_grad_tmp1 = np.matmul(dout * out_np * (1 - out_np),
+                                     np.transpose(DDY))
+        double_grad_tmp2 = np.matmul(DDX, y_np) + np.matmul(x_np, DDY)
+        double_grad_tmp3 = (
+            1 - 2 * out_np) * dout * double_grad_tmp2 * out_np * (1 - out_np)
+        ddx_expected = double_grad_tmp1 + np.matmul(double_grad_tmp3,
+                                                    np.transpose(y_np))
+        self.assertTrue(np.allclose(ddx_actual.numpy(), ddx_expected))
+
+        # Theoritical result based on math calculation
+        d_ddout = np.zeros(self.shape).astype('float32')
+        tmp0 = np.matmul(DDX, y_np) + np.matmul(x_np, DDY)
+        tmp1 = (1 - 2 * out_np) * ((1 - 2 * out_np) * dout * tmp0 * tmp0)
+        tmp2 = tmp0 * (1 - 2 * out_np) * d_ddout - 2 * dout * (
+            1 - out_np) * out_np * tmp0 * tmp0
+        dddx_expected = np.matmul(((tmp1 + tmp2) * out_np * (1 - out_np)),
+                                  np.transpose(y_np))
+
+        ddx_actual.backward()
+        dddx_grad_actual = x.gradient()
+        self.assertTrue(np.allclose(dddx_grad_actual, dddx_expected))
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_mean_op.py b/python/paddle/fluid/tests/unittests/test_mean_op.py
index e2a2dcf44f056..d5cc81456b84b 100644
--- a/python/paddle/fluid/tests/unittests/test_mean_op.py
+++ b/python/paddle/fluid/tests/unittests/test_mean_op.py
@@ -254,4 +254,5 @@ def test_errors(self):
 
 
 if __name__ == "__main__":
+    paddle.enable_static()
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_multinomial_op.py b/python/paddle/fluid/tests/unittests/test_multinomial_op.py
index 957c06eca89c3..cdb89bb964055 100644
--- a/python/paddle/fluid/tests/unittests/test_multinomial_op.py
+++ b/python/paddle/fluid/tests/unittests/test_multinomial_op.py
@@ -141,6 +141,14 @@ def test_dygraph3(self):
             "replacement is False. categories can't be sampled repeatedly")
         paddle.enable_static()
 
+    def test_dygraph4(self):
+        paddle.disable_static()
+        logits = -1 * paddle.ones([2800])
+        # Categorical.sample API will call multinomial op with replacement=True
+        cat = paddle.distribution.Categorical(logits.exp())
+        cat.sample([1])
+        paddle.enable_static()
+
     def test_static(self):
         paddle.enable_static()
         startup_program = fluid.Program()
diff --git a/python/paddle/fluid/tests/unittests/test_nn_matmul_v2_grad.py b/python/paddle/fluid/tests/unittests/test_nn_matmul_v2_grad.py
index 16e8a4a8b00fb..6dbabda1f4c34 100644
--- a/python/paddle/fluid/tests/unittests/test_nn_matmul_v2_grad.py
+++ b/python/paddle/fluid/tests/unittests/test_nn_matmul_v2_grad.py
@@ -1,4 +1,4 @@
-#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#   Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -146,5 +146,427 @@ def test_grad(self):
             self.func(p)
 
 
+class TestMatmulTripleGradCheckDotCase(unittest.TestCase):
+    def setUp(self):
+        self.init_test()
+
+
+def init_test(self):
+    self.x_shape = [2]
+    self.y_shape = [2]
+    self.transpose_x = False
+    self.transpose_y = False
+
+
+@prog_scope()
+def func(self, place):
+    eps = 0.005
+    dtype = np.float64
+    typename = "float64"
+    x = paddle.static.create_parameter(
+        dtype=typename, shape=self.x_shape, name='x')
+    y = paddle.static.create_parameter(
+        dtype=typename, shape=self.y_shape, name='y')
+    out = paddle.matmul(x, y, self.transpose_x, self.transpose_y, name='out')
+    np.random.seed(2021)
+    x_arr = np.random.uniform(-1, 1, self.x_shape).astype(dtype)
+    y_arr = np.random.uniform(-1, 1, self.y_shape).astype(dtype)
+    gradient_checker.triple_grad_check(
+        [x, y], out, x_init=[x_arr, y_arr], place=place, eps=eps)
+
+
+def test_grad(self):
+    places = [fluid.CPUPlace()]
+    if core.is_compiled_with_cuda():
+        places.append(fluid.CUDAPlace(0))
+    for p in places:
+        self.func(p)
+
+
+class TestMatmulTripleGradCheckNormalCase1(unittest.TestCase):
+    def setUp(self):
+        self.init_test()
+
+    def init_test(self):
+        self.x_shape = [2, 2]
+        self.y_shape = [2, 2]
+        self.transpose_x = False
+        self.transpose_y = False
+
+    @prog_scope()
+    def func(self, place):
+        eps = 0.005
+        dtype = np.float64
+        typename = "float64"
+        x = paddle.static.create_parameter(
+            dtype=typename, shape=self.x_shape, name='x')
+        y = paddle.static.create_parameter(
+            dtype=typename, shape=self.y_shape, name='y')
+        out = paddle.matmul(
+            x, y, self.transpose_x, self.transpose_y, name='out')
+        np.random.seed(2021)
+        x_arr = np.random.uniform(-1, 1, self.x_shape).astype(dtype)
+        y_arr = np.random.uniform(-1, 1, self.y_shape).astype(dtype)
+        gradient_checker.triple_grad_check(
+            [x, y], out, x_init=[x_arr, y_arr], place=place, eps=eps)
+
+    def test_grad(self):
+        places = [fluid.CPUPlace()]
+        if core.is_compiled_with_cuda():
+            places.append(fluid.CUDAPlace(0))
+        for p in places:
+            self.func(p)
+
+
+class TestMatmulTripleGradCheckNormalCase2(unittest.TestCase):
+    def setUp(self):
+        self.init_test()
+
+    def init_test(self):
+        self.x_shape = [2, 2]
+        self.y_shape = [2, 2]
+        self.transpose_x = True
+        self.transpose_y = False
+
+    @prog_scope()
+    def func(self, place):
+        eps = 0.005
+        dtype = np.float64
+        typename = "float64"
+        x = paddle.static.create_parameter(
+            dtype=typename, shape=self.x_shape, name='x')
+        y = paddle.static.create_parameter(
+            dtype=typename, shape=self.y_shape, name='y')
+        out = paddle.matmul(
+            x, y, self.transpose_x, self.transpose_y, name='out')
+        np.random.seed(2021)
+        x_arr = np.random.uniform(-1, 1, self.x_shape).astype(dtype)
+        y_arr = np.random.uniform(-1, 1, self.y_shape).astype(dtype)
+        gradient_checker.triple_grad_check(
+            [x, y], out, x_init=[x_arr, y_arr], place=place, eps=eps)
+
+    def test_grad(self):
+        places = [fluid.CPUPlace()]
+        if core.is_compiled_with_cuda():
+            places.append(fluid.CUDAPlace(0))
+        for p in places:
+            self.func(p)
+
+
+class TestMatmulTripleGradCheckNormalCase3(unittest.TestCase):
+    def setUp(self):
+        self.init_test()
+
+    def init_test(self):
+        self.x_shape = [2, 2]
+        self.y_shape = [2, 2]
+        self.transpose_x = False
+        self.transpose_y = True
+
+    @prog_scope()
+    def func(self, place):
+        eps = 0.005
+        dtype = np.float64
+        typename = "float64"
+        x = paddle.static.create_parameter(
+            dtype=typename, shape=self.x_shape, name='x')
+        y = paddle.static.create_parameter(
+            dtype=typename, shape=self.y_shape, name='y')
+        out = paddle.matmul(
+            x, y, self.transpose_x, self.transpose_y, name='out')
+        np.random.seed(2021)
+        x_arr = np.random.uniform(-1, 1, self.x_shape).astype(dtype)
+        y_arr = np.random.uniform(-1, 1, self.y_shape).astype(dtype)
+        gradient_checker.triple_grad_check(
+            [x, y], out, x_init=[x_arr, y_arr], place=place, eps=eps)
+
+    def test_grad(self):
+        places = [fluid.CPUPlace()]
+        if core.is_compiled_with_cuda():
+            places.append(fluid.CUDAPlace(0))
+        for p in places:
+            self.func(p)
+
+
+class TestMatmulTripleGradCheckNormalCase4(unittest.TestCase):
+    def setUp(self):
+        self.init_test()
+
+    def init_test(self):
+        self.x_shape = [2, 2]
+        self.y_shape = [2, 2]
+        self.transpose_x = True
+        self.transpose_y = True
+
+    @prog_scope()
+    def func(self, place):
+        eps = 0.005
+        dtype = np.float64
+        typename = "float64"
+        x = paddle.static.create_parameter(
+            dtype=typename, shape=self.x_shape, name='x')
+        y = paddle.static.create_parameter(
+            dtype=typename, shape=self.y_shape, name='y')
+        out = paddle.matmul(
+            x, y, self.transpose_x, self.transpose_y, name='out')
+        np.random.seed(2021)
+        x_arr = np.random.uniform(-1, 1, self.x_shape).astype(dtype)
+        y_arr = np.random.uniform(-1, 1, self.y_shape).astype(dtype)
+        gradient_checker.triple_grad_check(
+            [x, y], out, x_init=[x_arr, y_arr], place=place, eps=eps)
+
+    def test_grad(self):
+        places = [fluid.CPUPlace()]
+        if core.is_compiled_with_cuda():
+            places.append(fluid.CUDAPlace(0))
+        for p in places:
+            self.func(p)
+
+
+class TestMatmulTripleGradCheckBroadcastCase1(unittest.TestCase):
+    def setUp(self):
+        self.init_test()
+
+    def init_test(self):
+        self.x_shape = [3, 2, 2]
+        self.y_shape = [1, 2, 2]
+        self.transpose_x = False
+        self.transpose_y = False
+
+    @prog_scope()
+    def func(self, place):
+        eps = 0.005
+        dtype = np.float64
+        typename = "float64"
+        x = paddle.static.create_parameter(
+            dtype=typename, shape=self.x_shape, name='x')
+        y = paddle.static.create_parameter(
+            dtype=typename, shape=self.y_shape, name='y')
+        out = paddle.matmul(
+            x, y, self.transpose_x, self.transpose_y, name='out')
+        np.random.seed(2021)
+        x_arr = np.random.uniform(-1, 1, self.x_shape).astype(dtype)
+        y_arr = np.random.uniform(-1, 1, self.y_shape).astype(dtype)
+        gradient_checker.triple_grad_check(
+            [x, y], out, x_init=[x_arr, y_arr], place=place, eps=eps)
+
+    def test_grad(self):
+        places = [fluid.CPUPlace()]
+        if core.is_compiled_with_cuda():
+            places.append(fluid.CUDAPlace(0))
+        for p in places:
+            self.func(p)
+
+
+class TestMatmulTripleGradCheckBroadcastCase2(unittest.TestCase):
+    def setUp(self):
+        self.init_test()
+
+    def init_test(self):
+        self.x_shape = [1, 2, 2]
+        self.y_shape = [3, 2, 2]
+        self.transpose_x = False
+        self.transpose_y = False
+
+    @prog_scope()
+    def func(self, place):
+        eps = 0.005
+        dtype = np.float64
+        typename = "float64"
+        x = paddle.static.create_parameter(
+            dtype=typename, shape=self.x_shape, name='x')
+        y = paddle.static.create_parameter(
+            dtype=typename, shape=self.y_shape, name='y')
+        out = paddle.matmul(
+            x, y, self.transpose_x, self.transpose_y, name='out')
+        np.random.seed(2021)
+        x_arr = np.random.uniform(-1, 1, self.x_shape).astype(dtype)
+        y_arr = np.random.uniform(-1, 1, self.y_shape).astype(dtype)
+        gradient_checker.triple_grad_check(
+            [x, y], out, x_init=[x_arr, y_arr], place=place, eps=eps)
+
+    def test_grad(self):
+        places = [fluid.CPUPlace()]
+        if core.is_compiled_with_cuda():
+            places.append(fluid.CUDAPlace(0))
+        for p in places:
+            self.func(p)
+
+
+class TestMatmulTripleGradCheckBroadcastCase3(unittest.TestCase):
+    def setUp(self):
+        self.init_test()
+
+    def init_test(self):
+        self.x_shape = [1, 2, 2]
+        self.y_shape = [3, 2, 2]
+        self.transpose_x = True
+        self.transpose_y = False
+
+    @prog_scope()
+    def func(self, place):
+        eps = 0.005
+        dtype = np.float64
+        typename = "float64"
+        x = paddle.static.create_parameter(
+            dtype=typename, shape=self.x_shape, name='x')
+        y = paddle.static.create_parameter(
+            dtype=typename, shape=self.y_shape, name='y')
+        out = paddle.matmul(
+            x, y, self.transpose_x, self.transpose_y, name='out')
+        np.random.seed(2021)
+        x_arr = np.random.uniform(-1, 1, self.x_shape).astype(dtype)
+        y_arr = np.random.uniform(-1, 1, self.y_shape).astype(dtype)
+        gradient_checker.triple_grad_check(
+            [x, y], out, x_init=[x_arr, y_arr], place=place, eps=eps)
+
+    def test_grad(self):
+        places = [fluid.CPUPlace()]
+        if core.is_compiled_with_cuda():
+            places.append(fluid.CUDAPlace(0))
+        for p in places:
+            self.func(p)
+
+
+class TestMatmulTripleGradCheckBroadcastCase4(unittest.TestCase):
+    def setUp(self):
+        self.init_test()
+
+    def init_test(self):
+        self.x_shape = [1, 2, 2]
+        self.y_shape = [3, 2, 2]
+        self.transpose_x = False
+        self.transpose_y = True
+
+    @prog_scope()
+    def func(self, place):
+        eps = 0.005
+        dtype = np.float64
+        typename = "float64"
+        x = paddle.static.create_parameter(
+            dtype=typename, shape=self.x_shape, name='x')
+        y = paddle.static.create_parameter(
+            dtype=typename, shape=self.y_shape, name='y')
+        out = paddle.matmul(
+            x, y, self.transpose_x, self.transpose_y, name='out')
+        np.random.seed(2021)
+        x_arr = np.random.uniform(-1, 1, self.x_shape).astype(dtype)
+        y_arr = np.random.uniform(-1, 1, self.y_shape).astype(dtype)
+        gradient_checker.triple_grad_check(
+            [x, y], out, x_init=[x_arr, y_arr], place=place, eps=eps)
+
+    def test_grad(self):
+        places = [fluid.CPUPlace()]
+        if core.is_compiled_with_cuda():
+            places.append(fluid.CUDAPlace(0))
+        for p in places:
+            self.func(p)
+
+
+class TestMatmulTripleGradCheckBroadcastCase5(unittest.TestCase):
+    def setUp(self):
+        self.init_test()
+
+    def init_test(self):
+        self.x_shape = [1, 2, 2]
+        self.y_shape = [3, 2, 2]
+        self.transpose_x = True
+        self.transpose_y = True
+
+    @prog_scope()
+    def func(self, place):
+        eps = 0.005
+        dtype = np.float64
+        typename = "float64"
+        x = paddle.static.create_parameter(
+            dtype=typename, shape=self.x_shape, name='x')
+        y = paddle.static.create_parameter(
+            dtype=typename, shape=self.y_shape, name='y')
+        out = paddle.matmul(
+            x, y, self.transpose_x, self.transpose_y, name='out')
+        np.random.seed(2021)
+        x_arr = np.random.uniform(-1, 1, self.x_shape).astype(dtype)
+        y_arr = np.random.uniform(-1, 1, self.y_shape).astype(dtype)
+        gradient_checker.triple_grad_check(
+            [x, y], out, x_init=[x_arr, y_arr], place=place, eps=eps)
+
+    def test_grad(self):
+        places = [fluid.CPUPlace()]
+        if core.is_compiled_with_cuda():
+            places.append(fluid.CUDAPlace(0))
+        for p in places:
+            self.func(p)
+
+
+class TestMatmulTripleGradCheckSpecialCase1(unittest.TestCase):
+    def setUp(self):
+        self.init_test()
+
+    def init_test(self):
+        self.x_shape = [3, 4, 5]
+        self.y_shape = [5]
+        self.transpose_x = False
+        self.transpose_y = False
+
+    @prog_scope()
+    def func(self, place):
+        eps = 0.005
+        dtype = np.float64
+        typename = "float64"
+        x = paddle.static.create_parameter(
+            dtype=typename, shape=self.x_shape, name='x')
+        y = paddle.static.create_parameter(
+            dtype=typename, shape=self.y_shape, name='y')
+        out = paddle.matmul(
+            x, y, self.transpose_x, self.transpose_y, name='out')
+        np.random.seed(2021)
+        x_arr = np.random.uniform(-1, 1, self.x_shape).astype(dtype)
+        y_arr = np.random.uniform(-1, 1, self.y_shape).astype(dtype)
+        gradient_checker.triple_grad_check(
+            [x, y], out, x_init=[x_arr, y_arr], place=place, eps=eps)
+
+    def test_grad(self):
+        places = [fluid.CPUPlace()]
+        if core.is_compiled_with_cuda():
+            places.append(fluid.CUDAPlace(0))
+        for p in places:
+            self.func(p)
+
+
+class TestMatmulTripleGradCheckSpecialCase2(unittest.TestCase):
+    def setUp(self):
+        self.init_test()
+
+    def init_test(self):
+        self.x_shape = [4, 5, 5]
+        self.y_shape = [5]
+        self.transpose_x = True
+        self.transpose_y = False
+
+    @prog_scope()
+    def func(self, place):
+        eps = 0.005
+        dtype = np.float64
+        typename = "float64"
+        x = paddle.static.create_parameter(
+            dtype=typename, shape=self.x_shape, name='x')
+        y = paddle.static.create_parameter(
+            dtype=typename, shape=self.y_shape, name='y')
+        out = paddle.matmul(
+            x, y, self.transpose_x, self.transpose_y, name='out')
+        np.random.seed(2021)
+        x_arr = np.random.uniform(-1, 1, self.x_shape).astype(dtype)
+        y_arr = np.random.uniform(-1, 1, self.y_shape).astype(dtype)
+        gradient_checker.triple_grad_check(
+            [x, y], out, x_init=[x_arr, y_arr], place=place, eps=eps)
+
+    def test_grad(self):
+        places = [fluid.CPUPlace()]
+        if core.is_compiled_with_cuda():
+            places.append(fluid.CUDAPlace(0))
+        for p in places:
+            self.func(p)
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_nvprof.py b/python/paddle/fluid/tests/unittests/test_nvprof.py
deleted file mode 100644
index da943d64da6cf..0000000000000
--- a/python/paddle/fluid/tests/unittests/test_nvprof.py
+++ /dev/null
@@ -1,48 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import os
-import numpy as np
-import paddle.fluid as fluid
-import paddle.fluid.profiler as profiler
-import paddle.fluid.layers as layers
-import paddle.fluid.core as core
-
-
-class TestNVProf(unittest.TestCase):
-    def test_nvprof(self):
-        if not fluid.core.is_compiled_with_cuda():
-            return
-        epoc = 8
-        dshape = [4, 3, 28, 28]
-        data = layers.data(name='data', shape=[3, 28, 28], dtype='float32')
-        conv = layers.conv2d(data, 20, 3, stride=[1, 1], padding=[1, 1])
-
-        place = fluid.CUDAPlace(0)
-        exe = fluid.Executor(place)
-        exe.run(fluid.default_startup_program())
-
-        output_file = 'cuda_profiler.txt'
-        with profiler.cuda_profiler(output_file, 'csv') as nvprof:
-            for i in range(epoc):
-                input = np.random.random(dshape).astype('float32')
-                exe.run(fluid.default_main_program(), feed={'data': input})
-        os.remove(output_file)
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_run_cinn.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_run_cinn.py
index d26c7a1bb441e..d9ae3cf5e757d 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_run_cinn.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_run_cinn.py
@@ -16,14 +16,17 @@
 
 import logging
 import numpy as np
+import os
 import paddle
+import shutil
+import tempfile
 import unittest
 
 paddle.enable_static()
 
 logging.basicConfig(
     format='%(asctime)s - %(levelname)s - %(message)s', level=logging.INFO)
-logger = logging.getLogger(__name__)
+logger = logging.getLogger("paddle_with_cinn")
 
 
 def set_cinn_flag(val):
@@ -36,34 +39,87 @@ def set_cinn_flag(val):
     return cinn_compiled
 
 
+def reader(limit):
+    for _ in range(limit):
+        yield np.random.random([1, 28]).astype('float32'), \
+            np.random.randint(0, 2, size=[1]).astype('int64')
+
+
+def rand_data(img, label, loop_num=10):
+    feed = []
+    data = reader(loop_num)
+    for _ in range(loop_num):
+        d, l = next(data)
+        feed.append({img: d, label: l})
+    return feed
+
+
+def build_program(main_program, startup_program):
+    with paddle.static.program_guard(main_program, startup_program):
+        img = paddle.static.data(name='img', shape=[1, 28], dtype='float32')
+        param = paddle.create_parameter(
+            name="bias",
+            shape=[1, 28],
+            dtype="float32",
+            attr=paddle.ParamAttr(initializer=paddle.nn.initializer.Assign(
+                np.random.rand(1, 28).astype(np.float32))))
+        label = paddle.static.data(name="label", shape=[1], dtype='int64')
+
+        hidden = paddle.add(img, param)
+        prediction = paddle.nn.functional.relu(hidden)
+
+        loss = paddle.nn.functional.cross_entropy(input=prediction, label=label)
+        avg_loss = paddle.mean(loss)
+        adam = paddle.optimizer.Adam(learning_rate=0.001)
+        adam.minimize(avg_loss)
+    return img, label, avg_loss
+
+
+def train(dot_save_dir, prefix, seed=1234):
+    np.random.seed(seed)
+    paddle.seed(seed)
+    if paddle.is_compiled_with_cuda():
+        paddle.set_flags({'FLAGS_cudnn_deterministic': 1})
+
+    startup_program = paddle.static.Program()
+    main_program = paddle.static.Program()
+    img, label, loss = build_program(main_program, startup_program)
+
+    place = paddle.CUDAPlace(0) if paddle.is_compiled_with_cuda(
+    ) else paddle.CPUPlace()
+    exe = paddle.static.Executor(place)
+    exe.run(startup_program)
+
+    build_strategy = paddle.static.BuildStrategy()
+    build_strategy.debug_graphviz_path = os.path.join(dot_save_dir, prefix)
+    compiled_program = paddle.static.CompiledProgram(
+        main_program, build_strategy).with_data_parallel(loss_name=loss.name)
+
+    iters = 100
+    feed = rand_data(img.name, label.name, iters)
+    loss_values = []
+    for step in range(iters):
+        loss_v = exe.run(compiled_program,
+                         feed=feed[step],
+                         fetch_list=[loss],
+                         return_merged=False)
+        loss_values.append(loss_v[0][0][0])
+    return loss_values
+
+
 @unittest.skipIf(not set_cinn_flag(True), "Paddle is not compiled with CINN.")
 class TestParallelExecutorRunCinn(unittest.TestCase):
-    def test_run_from_cinn(self):
-        main_program = paddle.static.Program()
-        startup_program = paddle.static.Program()
-        with paddle.static.program_guard(main_program, startup_program):
-            data = paddle.static.data(
-                name='X', shape=[None, 1], dtype='float32')
-            prediction = paddle.static.nn.fc(data, 2)
-            loss = paddle.mean(prediction)
-            adam = paddle.optimizer.Adam()
-            adam.minimize(loss)
-
-        place = paddle.CUDAPlace(0) if paddle.is_compiled_with_cuda(
-        ) else paddle.CPUPlace()
-        exe = paddle.static.Executor(place)
-        exe.run(startup_program)
-        compiled_program = paddle.static.CompiledProgram(
-            main_program).with_data_parallel(loss_name=loss.name)
-
-        batch_size = 16
-        x = np.random.random(size=(batch_size, 1)).astype('float32')
-        fetch = exe.run(compiled_program,
-                        feed={'X': x},
-                        fetch_list=[prediction.name],
-                        return_merged=False)
+    def setUp(self):
+        self.tmpdir = tempfile.mkdtemp(prefix="dots_")
+
+    def tearDown(self):
+        shutil.rmtree(self.tmpdir)
 
+    def test_run_with_cinn(self):
+        cinn_losses = train(self.tmpdir, "paddle")
         set_cinn_flag(False)
+        pd_losses = train(self.tmpdir, "cinn")
+        self.assertTrue(np.allclose(cinn_losses, pd_losses, atol=1e-5))
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_randint_like.py b/python/paddle/fluid/tests/unittests/test_randint_like.py
new file mode 100644
index 0000000000000..c716fd549244b
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_randint_like.py
@@ -0,0 +1,221 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+import paddle
+from paddle.static import program_guard, Program
+
+
+# Test python API
+class TestRandintLikeAPI(unittest.TestCase):
+    def setUp(self):
+        self.x_bool = np.zeros((10, 12)).astype("bool")
+        self.x_int32 = np.zeros((10, 12)).astype("int32")
+        self.x_int64 = np.zeros((10, 12)).astype("int64")
+        self.x_float16 = np.zeros((10, 12)).astype("float16")
+        self.x_float32 = np.zeros((10, 12)).astype("float32")
+        self.x_float64 = np.zeros((10, 12)).astype("float64")
+
+        self.dtype = ["bool", "int32", "int64", "float16", "float32", "float64"]
+        self.place=paddle.CUDAPlace(0) if paddle.is_compiled_with_cuda() \
+            else paddle.CPUPlace()
+
+    def test_static_api(self):
+        paddle.enable_static()
+        with program_guard(Program(), Program()):
+            # results are from [-100, 100).
+            x_bool = paddle.fluid.data(
+                name="x_bool", shape=[10, 12], dtype="bool")
+            x_int32 = paddle.fluid.data(
+                name="x_int32", shape=[10, 12], dtype="int32")
+            x_int64 = paddle.fluid.data(
+                name="x_int64", shape=[10, 12], dtype="int64")
+            x_float16 = paddle.fluid.data(
+                name="x_float16", shape=[10, 12], dtype="float16")
+            x_float32 = paddle.fluid.data(
+                name="x_float32", shape=[10, 12], dtype="float32")
+            x_float64 = paddle.fluid.data(
+                name="x_float64", shape=[10, 12], dtype="float64")
+
+            exe = paddle.static.Executor(self.place)
+
+            # x dtype is bool output dtype in ["bool", "int32", "int64", "float16", "float32", "float64"]
+            outlist1 = [
+                paddle.randint_like(
+                    x_bool, low=-10, high=10, dtype=dtype)
+                for dtype in self.dtype
+            ]
+            outs1 = exe.run(feed={'x_bool': self.x_bool}, fetch_list=outlist1)
+            for out, dtype in zip(outs1, self.dtype):
+                self.assertTrue(out.dtype, np.dtype(dtype))
+                self.assertTrue(((out >= -10) & (out <= 10)).all(), True)
+
+            # x dtype is int32 output dtype in ["bool", "int32", "int64", "float16", "float32", "float64"]
+            outlist2 = [
+                paddle.randint_like(
+                    x_int32, low=-5, high=10, dtype=dtype)
+                for dtype in self.dtype
+            ]
+            outs2 = exe.run(feed={'x_int32': self.x_int32}, fetch_list=outlist2)
+            for out, dtype in zip(outs2, self.dtype):
+                self.assertTrue(out.dtype, np.dtype(dtype))
+                self.assertTrue(((out >= -5) & (out <= 10)).all(), True)
+
+            # x dtype is int64 output dtype in ["bool", "int32", "int64", "float16", "float32", "float64"]
+            outlist3 = [
+                paddle.randint_like(
+                    x_int64, low=-100, high=100, dtype=dtype)
+                for dtype in self.dtype
+            ]
+            outs3 = exe.run(feed={'x_int64': self.x_int64}, fetch_list=outlist3)
+            for out, dtype in zip(outs3, self.dtype):
+                self.assertTrue(out.dtype, np.dtype(dtype))
+                self.assertTrue(((out >= -100) & (out <= 100)).all(), True)
+
+            # x dtype is float16 output dtype in ["bool", "int32", "int64", "float16", "float32", "float64"]
+            outlist4 = [
+                paddle.randint_like(
+                    x_float16, low=-3, high=25, dtype=dtype)
+                for dtype in self.dtype
+            ]
+            outs4 = exe.run(feed={'x_float16': self.x_float16},
+                            fetch_list=outlist4)
+            for out, dtype in zip(outs4, self.dtype):
+                self.assertTrue(out.dtype, np.dtype(dtype))
+                self.assertTrue(((out >= -3) & (out <= 25)).all(), True)
+
+            # x dtype is float32 output dtype in ["bool", "int32", "int64", "float16", "float32", "float64"]
+            outlist5 = [
+                paddle.randint_like(
+                    x_float32, low=-25, high=25, dtype=dtype)
+                for dtype in self.dtype
+            ]
+            outs5 = exe.run(feed={'x_float32': self.x_float32},
+                            fetch_list=outlist5)
+            for out, dtype in zip(outs5, self.dtype):
+                self.assertTrue(out.dtype, np.dtype(dtype))
+                self.assertTrue(((out >= -25) & (out <= 25)).all(), True)
+
+            # x dtype is float64 output dtype in ["bool", "int32", "int64", "float16", "float32", "float64"]
+            outlist6 = [
+                paddle.randint_like(
+                    x_float64, low=-16, high=16, dtype=dtype)
+                for dtype in self.dtype
+            ]
+            outs6 = exe.run(feed={'x_float64': self.x_float64},
+                            fetch_list=outlist6)
+            for out, dtype in zip(outs6, self.dtype):
+                self.assertTrue(out.dtype, dtype)
+                self.assertTrue(((out >= -16) & (out <= 16)).all(), True)
+
+    def test_dygraph_api(self):
+        paddle.disable_static(self.place)
+        # x dtype ["bool", "int32", "int64", "float16", "float32", "float64"]
+        for x in [
+                self.x_bool, self.x_int32, self.x_int64, self.x_float16,
+                self.x_float32, self.x_float64
+        ]:
+            x_inputs = paddle.to_tensor(x)
+            # self.dtype ["bool", "int32", "int64", "float16", "float32", "float64"]
+            for dtype in self.dtype:
+                out = paddle.randint_like(
+                    x_inputs, low=-100, high=100, dtype=dtype)
+                self.assertTrue(out.numpy().dtype, np.dtype(dtype))
+                self.assertTrue(((out.numpy() >= -100) &
+                                 (out.numpy() <= 100)).all(), True)
+
+        paddle.enable_static()
+
+    def test_errors(self):
+        paddle.enable_static()
+        with program_guard(Program(), Program()):
+            x_bool = paddle.fluid.data(
+                name="x_bool", shape=[10, 12], dtype="bool")
+            x_int32 = paddle.fluid.data(
+                name="x_int32", shape=[10, 12], dtype="int32")
+            x_int64 = paddle.fluid.data(
+                name="x_int64", shape=[10, 12], dtype="int64")
+            x_float16 = paddle.fluid.data(
+                name="x_float16", shape=[10, 12], dtype="float16")
+            x_float32 = paddle.fluid.data(
+                name="x_float32", shape=[10, 12], dtype="float32")
+            x_float64 = paddle.fluid.data(
+                name="x_float64", shape=[10, 12], dtype="float64")
+
+            # x dtype is bool
+            # low is 5 and high is 5, low must less then high
+            self.assertRaises(
+                ValueError, paddle.randint_like, x_bool, low=5, high=5)
+            # low(default value) is 0 and high is -5, low must less then high
+            self.assertRaises(ValueError, paddle.randint_like, x_bool, high=-5)
+            # if high is None, low must be greater than 0
+            self.assertRaises(ValueError, paddle.randint_like, x_bool, low=-5)
+
+            # x dtype is int32
+            # low is 5 and high is 5, low must less then high
+            self.assertRaises(
+                ValueError, paddle.randint_like, x_int32, low=5, high=5)
+            # low(default value) is 0 and high is -5, low must less then high
+            self.assertRaises(ValueError, paddle.randint_like, x_int32, high=-5)
+            # if high is None, low must be greater than 0
+            self.assertRaises(ValueError, paddle.randint_like, x_int32, low=-5)
+
+            # x dtype is int64
+            # low is 5 and high is 5, low must less then high
+            self.assertRaises(
+                ValueError, paddle.randint_like, x_int64, low=5, high=5)
+            # low(default value) is 0 and high is -5, low must less then high
+            self.assertRaises(ValueError, paddle.randint_like, x_int64, high=-5)
+            # if high is None, low must be greater than 0
+            self.assertRaises(ValueError, paddle.randint_like, x_int64, low=-5)
+
+            # x dtype is float16
+            # low is 5 and high is 5, low must less then high
+            self.assertRaises(
+                ValueError, paddle.randint_like, x_float16, low=5, high=5)
+            # low(default value) is 0 and high is -5, low must less then high
+            self.assertRaises(
+                ValueError, paddle.randint_like, x_float16, high=-5)
+            # if high is None, low must be greater than 0
+            self.assertRaises(
+                ValueError, paddle.randint_like, x_float16, low=-5)
+
+            # x dtype is float32
+            # low is 5 and high is 5, low must less then high
+            self.assertRaises(
+                ValueError, paddle.randint_like, x_float32, low=5, high=5)
+            # low(default value) is 0 and high is -5, low must less then high
+            self.assertRaises(
+                ValueError, paddle.randint_like, x_float32, high=-5)
+            # if high is None, low must be greater than 0
+            self.assertRaises(
+                ValueError, paddle.randint_like, x_float32, low=-5)
+
+            # x dtype is float64
+            # low is 5 and high is 5, low must less then high
+            self.assertRaises(
+                ValueError, paddle.randint_like, x_float64, low=5, high=5)
+            # low(default value) is 0 and high is -5, low must less then high
+            self.assertRaises(
+                ValueError, paddle.randint_like, x_float64, high=-5)
+            # if high is None, low must be greater than 0
+            self.assertRaises(
+                ValueError, paddle.randint_like, x_float64, low=-5)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_resnet50_with_cinn.py b/python/paddle/fluid/tests/unittests/test_resnet50_with_cinn.py
new file mode 100644
index 0000000000000..7f865f55878e2
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_resnet50_with_cinn.py
@@ -0,0 +1,111 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import logging
+import numpy as np
+import paddle
+import unittest
+
+paddle.enable_static()
+
+logging.basicConfig(
+    format='%(asctime)s - %(levelname)s - %(message)s', level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+
+def set_cinn_flag(val):
+    cinn_compiled = False
+    try:
+        paddle.set_flags({'FLAGS_use_cinn': val})
+        cinn_compiled = True
+    except ValueError:
+        logger.warning("The used paddle is not compiled with CINN.")
+    return cinn_compiled
+
+
+@unittest.skipIf(not set_cinn_flag(True), "Paddle is not compiled with CINN.")
+class TestResnet50Accuracy(unittest.TestCase):
+    def reader(self, limit):
+        for _ in range(limit):
+            yield np.random.randint(0, 256, size=[32, 3, 224, 224]).astype('float32'), \
+                  np.random.randint(0, 1000, size=[32]).astype('int64')
+
+    def generate_random_data(self, loop_num=10):
+        feed = []
+        data = self.reader(loop_num)
+        for _ in range(loop_num):
+            x, y = next(data)
+            feed.append({'image': x, 'label': y})
+        return feed
+
+    def build_program(self, main_program, startup_program):
+        with paddle.static.program_guard(main_program, startup_program):
+            image = paddle.static.data(
+                name='image', shape=[32, 3, 224, 224], dtype='float32')
+            label = paddle.static.data(name='label', shape=[32], dtype='int64')
+
+            model = paddle.vision.models.resnet50()
+            prediction = model(image)
+
+            loss = paddle.nn.functional.cross_entropy(
+                input=prediction, label=label)
+            loss = paddle.mean(loss)
+            adam = paddle.optimizer.Adam(learning_rate=0.001)
+            adam.minimize(loss)
+        return loss
+
+    def train(self, place, iters, feed, use_cinn=False, seed=1234):
+        np.random.seed(seed)
+        paddle.seed(seed)
+        if paddle.is_compiled_with_cuda():
+            paddle.set_flags({'FLAGS_cudnn_deterministic': 1})
+        set_cinn_flag(use_cinn)
+
+        startup_program = paddle.static.Program()
+        main_program = paddle.static.Program()
+
+        loss = self.build_program(main_program, startup_program)
+        exe = paddle.static.Executor(place)
+
+        compiled_prog = paddle.static.CompiledProgram(
+            main_program).with_data_parallel(loss_name=loss.name)
+        loss_vals = []
+        scope = paddle.static.Scope()
+
+        with paddle.static.scope_guard(scope):
+            exe.run(startup_program)
+            for step in range(iters):
+                loss_v = exe.run(compiled_prog,
+                                 feed=feed[step],
+                                 fetch_list=[loss],
+                                 return_numpy=True)
+                loss_vals.append(loss_v[0][0])
+        return loss_vals
+
+    def test_check_resnet50_accuracy(self):
+        place = paddle.CUDAPlace(0) if paddle.is_compiled_with_cuda(
+        ) else paddle.CPUPlace()
+
+        loop_num = 10
+        feed = self.generate_random_data(loop_num)
+
+        loss_c = self.train(place, loop_num, feed, use_cinn=True)
+        loss_p = self.train(place, loop_num, feed, use_cinn=False)
+        self.assertTrue(np.allclose(loss_c, loss_p, atol=1e-5))
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_rnn_op.py b/python/paddle/fluid/tests/unittests/test_rnn_op.py
index 763ec3e7038a4..79e33166bb6f8 100644
--- a/python/paddle/fluid/tests/unittests/test_rnn_op.py
+++ b/python/paddle/fluid/tests/unittests/test_rnn_op.py
@@ -168,5 +168,35 @@ def set_attrs(self):
         self.is_bidirec = True
 
 
+class TestRNNOp5(TestRNNOp):
+    def set_attrs(self):
+        self.num_layers = 2
+
+
+class TestRNNOp6(TestRNNOp):
+    def set_attrs(self):
+        self.num_layers = 2
+        self.is_bidirec = True
+
+
+class TestRNNOp7(TestRNNOp):
+    def set_attrs(self):
+        self.num_layers = 2
+        self.is_bidirec = True
+        self.is_test = True
+
+
+class TestRNNOp8(TestRNNOp):
+    def set_attrs(self):
+        self.num_layers = 2
+        self.is_bidirec = True
+        self.sequence_length = None
+
+
+class TestRNNOp9(TestRNNOp):
+    def set_attrs(self):
+        self.num_layers = 3
+
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_scale_op.py b/python/paddle/fluid/tests/unittests/test_scale_op.py
index c1ce032f50612..baedc2b095914 100644
--- a/python/paddle/fluid/tests/unittests/test_scale_op.py
+++ b/python/paddle/fluid/tests/unittests/test_scale_op.py
@@ -109,7 +109,9 @@ def check_with_place(self, place, in_name, out_name):
 
         assert (in_array * scale == result_array).all()
         assert in_height == out_height
-        assert in_rows == out_rows
+        # TODO(chenweihang): output rows and height cannot be shared into
+        # fluid output tensor
+        # assert in_rows == out_rows
 
     def test_scale_selected_rows(self):
         places = [core.CPUPlace()]
diff --git a/python/paddle/fluid/tests/unittests/test_set_value_op.py b/python/paddle/fluid/tests/unittests/test_set_value_op.py
index 21f506d03ce68..e9809318cb393 100644
--- a/python/paddle/fluid/tests/unittests/test_set_value_op.py
+++ b/python/paddle/fluid/tests/unittests/test_set_value_op.py
@@ -1154,6 +1154,18 @@ def set_value5(t, value):
             msg="The gradient of input should be \n{},\n but reveived {}".
             format(value_grad, value.grad.numpy()))
 
+        # case 6: pass stop_gradient from value to x
+        x = paddle.zeros([8, 8], dtype='float32')
+        value = paddle.to_tensor([10], dtype='float32', stop_gradient=False)
+
+        self.assertTrue(x.stop_gradient)
+        self.assertTrue(x.is_leaf)
+
+        x[0, :] = value
+
+        self.assertTrue(~x.stop_gradient)
+        self.assertTrue(~x.is_leaf)
+
     def test_static_graph(self):
         paddle.enable_static()
 
diff --git a/python/paddle/fluid/tests/unittests/test_sign_op.py b/python/paddle/fluid/tests/unittests/test_sign_op.py
index da5080eabddc9..bd145a968ed85 100644
--- a/python/paddle/fluid/tests/unittests/test_sign_op.py
+++ b/python/paddle/fluid/tests/unittests/test_sign_op.py
@@ -83,4 +83,5 @@ def test_static(self):
 
 
 if __name__ == "__main__":
+    paddle.enable_static()
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_softmax_mask_fuse_upper_triangle_op.py b/python/paddle/fluid/tests/unittests/test_softmax_mask_fuse_upper_triangle_op.py
index 8b6d37882ba1a..a73ebd73e4946 100644
--- a/python/paddle/fluid/tests/unittests/test_softmax_mask_fuse_upper_triangle_op.py
+++ b/python/paddle/fluid/tests/unittests/test_softmax_mask_fuse_upper_triangle_op.py
@@ -43,7 +43,7 @@ def _get_softmax_upper(x, fp16=True):
 class TestSoftmaxMaskFuseOp(OpTest):
     def setUp(self):
         self.op_type = "fused_softmax_mask_upper_triangle"
-        x = np.random.random((1, 1, 32, 32)).astype("float16")
+        x = np.random.random((1, 4, 32, 32)).astype("float16")
         self.inputs = {'X': x}
         rst = _get_softmax_upper(x)
         self.outputs = {'Out': rst}
@@ -60,7 +60,7 @@ def test_check_grad(self):
 class TestSoftmaxMaskFuseOp1(OpTest):
     def setUp(self):
         self.op_type = "fused_softmax_mask_upper_triangle"
-        x = np.random.random((1, 1, 32, 32))
+        x = np.random.random((1, 4, 32, 32))
         self.inputs = {'X': x}
         rst = _get_softmax_upper(x)
         self.outputs = {'Out': rst}
@@ -90,10 +90,10 @@ def test_static(self):
         for dtype in self.dtypes:
             with fluid.program_guard(fluid.Program(), fluid.Program()):
                 input_x = fluid.data(
-                    name="x", shape=[1, 1, 32, 32], dtype=dtype)
+                    name="x", shape=[1, 4, 32, 32], dtype=dtype)
                 rst = incubate.softmax_mask_fuse_upper_triangle(input_x)
 
-                x_in_np = np.random.random((1, 1, 32, 32)).astype(dtype)
+                x_in_np = np.random.random((1, 4, 32, 32)).astype(dtype)
                 rst_np = _get_softmax_upper(x_in_np, dtype == 'float16')
 
                 exe = fluid.Executor(fluid.CUDAPlace(0))
@@ -105,7 +105,7 @@ def test_static(self):
     def test_dygraph(self):
         for dtype in self.dtypes:
             with fluid.dygraph.guard(fluid.CUDAPlace(0)):
-                x_in_np = np.random.random((1, 1, 32, 32)).astype(dtype)
+                x_in_np = np.random.random((1, 4, 32, 32)).astype(dtype)
                 rst_np = _get_softmax_upper(x_in_np, dtype == 'float16')
                 input_x = fluid.dygraph.to_variable(x_in_np)
 
diff --git a/python/paddle/fluid/tests/unittests/test_sparse_attention_op.py b/python/paddle/fluid/tests/unittests/test_sparse_attention_op.py
index 5134b885f3307..cce4742f16455 100644
--- a/python/paddle/fluid/tests/unittests/test_sparse_attention_op.py
+++ b/python/paddle/fluid/tests/unittests/test_sparse_attention_op.py
@@ -128,8 +128,8 @@ def init_csr_format(batch_size, num_heads, rows, blocksize):
 
 
 @unittest.skipIf(
-    not core.is_compiled_with_cuda() or get_cuda_version() < 11020,
-    "core is not compiled with CUDA and cuda version need larger than or equal to 11.2"
+    not core.is_compiled_with_cuda() or get_cuda_version() < 11030,
+    "core is not compiled with CUDA and cuda version need larger than or equal to 11.3"
 )
 class TestSparseAttentionOp(OpTest):
     def config(self):
@@ -190,8 +190,8 @@ def config(self):
 
 
 @unittest.skipIf(
-    not core.is_compiled_with_cuda() or get_cuda_version() < 11020,
-    "core is not compiled with CUDA and cuda version need larger than or equal to 11.2"
+    not core.is_compiled_with_cuda() or get_cuda_version() < 11030,
+    "core is not compiled with CUDA and cuda version need larger than or equal to 11.3"
 )
 class TestSparseAttentionAPI(unittest.TestCase):
     def setUp(self):
diff --git a/python/paddle/fluid/tests/unittests/test_tensor_copy_from.py b/python/paddle/fluid/tests/unittests/test_tensor_copy_from.py
new file mode 100644
index 0000000000000..6a91c2182d1c5
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_tensor_copy_from.py
@@ -0,0 +1,39 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+import unittest
+import numpy as np
+from paddle.fluid.core import LoDTensor as Tensor
+
+
+class TestTensorCopyFrom(unittest.TestCase):
+    def test_main(self):
+        place = paddle.CPUPlace()
+        np_value = np.random.random(size=[10, 30]).astype('float32')
+        t_src = Tensor()
+        t_src.set(np_value, place)
+        self.assertTrue(np.array_equal(np_value, t_src))
+
+        t_dst1 = Tensor()
+        t_dst1._copy_from(t_src, place)
+        self.assertTrue(np.array_equal(np_value, t_dst1))
+
+        t_dst2 = Tensor()
+        t_dst2._copy_from(t_src, place, 5)
+        self.assertTrue(np.array_equal(np.array(np_value[0:5]), t_dst2))
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_triangular_solve_op.py b/python/paddle/fluid/tests/unittests/test_triangular_solve_op.py
new file mode 100644
index 0000000000000..45e88d681d8e0
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_triangular_solve_op.py
@@ -0,0 +1,339 @@
+#   Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.w
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+
+import sys
+sys.path.append("..")
+import paddle
+from op_test import OpTest
+import paddle.fluid as fluid
+from paddle.fluid import Program, program_guard, core
+
+paddle.enable_static()
+
+
+# 2D + 2D , test 'upper'
+class TestTriangularSolveOp(OpTest):
+    """
+    case 1
+    """
+
+    def config(self):
+        self.x_shape = [12, 12]
+        self.y_shape = [12, 10]
+        self.upper = True
+        self.transpose = False
+        self.unitriangular = False
+        self.dtype = "float64"
+
+    def set_output(self):
+        self.output = np.linalg.solve(
+            np.triu(self.inputs['X']), self.inputs['Y'])
+
+    def setUp(self):
+        self.op_type = "triangular_solve"
+        self.config()
+
+        self.inputs = {
+            'X': np.random.random(self.x_shape).astype(self.dtype),
+            'Y': np.random.random(self.y_shape).astype(self.dtype)
+        }
+        self.attrs = {
+            'upper': self.upper,
+            'transpose': self.transpose,
+            'unitriangular': self.unitriangular,
+        }
+        self.set_output()
+        self.outputs = {'Out': self.output}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad_normal(self):
+        self.check_grad(['X', 'Y'], 'Out')
+
+
+# 2D(broadcast) + 3D, test 'transpose'
+class TestTriangularSolveOp2(TestTriangularSolveOp):
+    """
+    case 2
+    """
+
+    def config(self):
+        self.x_shape = [10, 10]
+        self.y_shape = [3, 10, 8]
+        self.upper = False
+        self.transpose = True
+        self.unitriangular = False
+        self.dtype = "float64"
+
+    def set_output(self):
+        x = np.tril(self.inputs['X']).transpose(1, 0)
+        y = self.inputs['Y']
+        self.output = np.linalg.solve(x, y)
+
+
+# 3D(broadcast) + 3D
+class TestTriangularSolveOp3(TestTriangularSolveOp):
+    """
+    case 3
+    """
+
+    def config(self):
+        self.x_shape = [1, 10, 10]
+        self.y_shape = [6, 10, 12]
+        self.upper = False
+        self.transpose = False
+        self.unitriangular = False
+        self.dtype = "float64"
+
+    def set_output(self):
+        x = np.tril(self.inputs['X'])
+        y = self.inputs['Y']
+        self.output = np.linalg.solve(x, y)
+
+
+# 3D + 3D(broadcast), test 'transpose'
+class TestTriangularSolveOp4(TestTriangularSolveOp):
+    """
+    case 4
+    """
+
+    def config(self):
+        self.x_shape = [3, 10, 10]
+        self.y_shape = [1, 10, 12]
+        self.upper = True
+        self.transpose = True
+        self.unitriangular = False
+        self.dtype = "float64"
+
+    def set_output(self):
+        x = np.triu(self.inputs['X']).transpose(0, 2, 1)
+        y = self.inputs['Y']
+        self.output = np.linalg.solve(x, y)
+
+
+# 2D + 2D , test 'unitriangular' specially
+class TestTriangularSolveOp5(TestTriangularSolveOp):
+    """
+    case 5
+    """
+
+    def config(self):
+        self.x_shape = [10, 10]
+        self.y_shape = [10, 10]
+        self.upper = True
+        self.transpose = False
+        self.unitriangular = True
+        self.dtype = "float64"
+
+    def set_output(self):
+        x = np.triu(self.inputs['X'])
+        np.fill_diagonal(x, 1.)
+        y = self.inputs['Y']
+        self.output = np.linalg.solve(x, y)
+
+    def test_check_grad_normal(self):
+        x = np.triu(self.inputs['X'])
+        np.fill_diagonal(x, 1.)
+        grad_out = np.ones([10, 10]).astype('float64')
+        grad_y = np.linalg.solve(x.transpose(1, 0), grad_out)
+
+        grad_x = -np.matmul(grad_y, self.output.transpose(1, 0))
+        grad_x = np.triu(grad_x)
+        np.fill_diagonal(grad_x, 0.)
+
+        self.check_grad(
+            ['X', 'Y'],
+            'Out',
+            user_defined_grads=[grad_x, grad_y],
+            user_defined_grad_outputs=[grad_out])
+
+
+# 4D(broadcast) + 4D(broadcast)
+class TestTriangularSolveOp6(TestTriangularSolveOp):
+    """
+    case 6
+    """
+
+    def config(self):
+        self.x_shape = [1, 3, 10, 10]
+        self.y_shape = [2, 1, 10, 5]
+        self.upper = False
+        self.transpose = False
+        self.unitriangular = False
+        self.dtype = "float64"
+
+    def set_output(self):
+        x = np.tril(self.inputs['X'])
+        y = self.inputs['Y']
+        self.output = np.linalg.solve(x, y)
+
+
+# 3D(broadcast) + 4D(broadcast), test 'upper'
+class TestTriangularSolveOp7(TestTriangularSolveOp):
+    """
+    case 7
+    """
+
+    def config(self):
+        self.x_shape = [2, 10, 10]
+        self.y_shape = [5, 1, 10, 2]
+        self.upper = True
+        self.transpose = True
+        self.unitriangular = False
+        self.dtype = "float64"
+
+    def set_output(self):
+        x = np.triu(self.inputs['X']).transpose(0, 2, 1)
+        y = self.inputs['Y']
+        self.output = np.linalg.solve(x, y)
+
+
+# 3D(broadcast) + 5D
+class TestTriangularSolveOp8(TestTriangularSolveOp):
+    """
+    case 8
+    """
+
+    def config(self):
+        self.x_shape = [12, 3, 3]
+        self.y_shape = [2, 3, 12, 3, 2]
+        self.upper = False
+        self.transpose = False
+        self.unitriangular = False
+        self.dtype = "float64"
+
+    def set_output(self):
+        x = np.tril(self.inputs['X'])
+        y = self.inputs['Y']
+        self.output = np.linalg.solve(x, y)
+
+
+# 5D + 4D(broadcast)
+class TestTriangularSolveOp9(TestTriangularSolveOp):
+    """
+    case 9
+    """
+
+    def config(self):
+        self.x_shape = [2, 4, 2, 3, 3]
+        self.y_shape = [4, 1, 3, 10]
+        self.upper = False
+        self.transpose = False
+        self.unitriangular = False
+        self.dtype = "float64"
+
+    def set_output(self):
+        x = np.tril(self.inputs['X'])
+        y = self.inputs['Y']
+        self.output = np.matmul(np.linalg.inv(x), y)
+
+
+class TestTriangularSolveAPI(unittest.TestCase):
+    def setUp(self):
+        np.random.seed(2021)
+        self.place = [paddle.CPUPlace()]
+        self.dtype = "float64"
+        if core.is_compiled_with_cuda():
+            self.place.append(paddle.CUDAPlace(0))
+
+    def check_static_result(self, place):
+        with fluid.program_guard(fluid.Program(), fluid.Program()):
+            x = fluid.data(name="x", shape=[3, 3], dtype=self.dtype)
+            y = fluid.data(name="y", shape=[3, 2], dtype=self.dtype)
+            z = paddle.linalg.triangular_solve(x, y)
+
+            x_np = np.random.random([3, 3]).astype(self.dtype)
+            y_np = np.random.random([3, 2]).astype(self.dtype)
+            z_np = np.linalg.solve(np.triu(x_np), y_np)
+
+            exe = fluid.Executor(place)
+            fetches = exe.run(fluid.default_main_program(),
+                              feed={"x": x_np,
+                                    "y": y_np},
+                              fetch_list=[z])
+            self.assertTrue(np.allclose(fetches[0], z_np))
+
+    def test_static(self):
+        for place in self.place:
+            self.check_static_result(place=place)
+
+    def test_dygraph(self):
+        def run(place):
+            paddle.disable_static(place)
+            x_np = np.random.random([3, 3]).astype(self.dtype)
+            y_np = np.random.random([3, 2]).astype(self.dtype)
+            z_np = np.linalg.solve(np.tril(x_np), y_np)
+
+            x = paddle.to_tensor(x_np)
+            y = paddle.to_tensor(y_np)
+            z = paddle.linalg.triangular_solve(x, y, upper=False)
+
+            self.assertTrue(np.allclose(z_np, z.numpy()))
+            self.assertEqual(z_np.shape, z.numpy().shape)
+            paddle.enable_static()
+
+        for place in self.place:
+            run(place)
+
+
+class TestTriangularSolveOpError(unittest.TestCase):
+    def test_errors(self):
+        with program_guard(Program(), Program()):
+            # The input type of solve_op must be Variable.
+            x1 = fluid.create_lod_tensor(
+                np.array([[-1]]), [[1]], fluid.CPUPlace())
+            y1 = fluid.create_lod_tensor(
+                np.array([[-1]]), [[1]], fluid.CPUPlace())
+            self.assertRaises(TypeError, paddle.linalg.triangular_solve, x1, y1)
+
+            # The data type of input must be float32 or float64.        
+            x2 = fluid.data(name="x2", shape=[30, 30], dtype="bool")
+            y2 = fluid.data(name="y2", shape=[30, 10], dtype="bool")
+            self.assertRaises(TypeError, paddle.linalg.triangular_solve, x2, y2)
+
+            x3 = fluid.data(name="x3", shape=[30, 30], dtype="int32")
+            y3 = fluid.data(name="y3", shape=[30, 10], dtype="int32")
+            self.assertRaises(TypeError, paddle.linalg.triangular_solve, x3, y3)
+
+            x4 = fluid.data(name="x4", shape=[30, 30], dtype="float16")
+            y4 = fluid.data(name="y4", shape=[30, 10], dtype="float16")
+            self.assertRaises(TypeError, paddle.linalg.triangular_solve, x4, y4)
+
+            # The number of dimensions of input'X must be >= 2.
+            x5 = fluid.data(name="x5", shape=[30], dtype="float64")
+            y5 = fluid.data(name="y5", shape=[30, 30], dtype="float64")
+            self.assertRaises(ValueError, paddle.linalg.triangular_solve, x5,
+                              y5)
+
+            # The number of dimensions of input'Y must be >= 2.
+            x6 = fluid.data(name="x6", shape=[30, 30], dtype="float64")
+            y6 = fluid.data(name="y6", shape=[30], dtype="float64")
+            self.assertRaises(ValueError, paddle.linalg.triangular_solve, x6,
+                              y6)
+
+            # The inner-most 2 dimensions of input'X should be equal to each other
+            x7 = fluid.data(name="x7", shape=[2, 3, 4], dtype="float64")
+            y7 = fluid.data(name="y7", shape=[2, 4, 3], dtype="float64")
+            self.assertRaises(ValueError, paddle.linalg.triangular_solve, x7,
+                              y7)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_warpctc_op.py b/python/paddle/fluid/tests/unittests/test_warpctc_op.py
index 6358cbcf0bbb2..53f3b3cf53d76 100644
--- a/python/paddle/fluid/tests/unittests/test_warpctc_op.py
+++ b/python/paddle/fluid/tests/unittests/test_warpctc_op.py
@@ -18,7 +18,6 @@
 import unittest
 import numpy as np
 from op_test import OpTest
-from op_test import skip_check_grad_ci
 from test_softmax_op import stable_softmax
 import paddle.fluid as fluid
 import paddle.fluid.core as core
@@ -457,220 +456,6 @@ def test_check_grad(self):
         self.check_grad(["Logits"], "Loss")
 
 
-@skip_check_grad_ci(reason="For warpctc, not check grad.")
-class TestWarpCTCOpAttr(OpTest):
-    def config(self):
-        self.batch_size = 4
-        self.num_classes = 8
-        self.logits_lod = [[4, 1, 5, 5]]
-        self.labels_lod = [[3, 1, 4, 2]]
-        self.logits_length = np.array([4, 1, 5, 5], dtype=np.int64)
-        self.labels_length = np.array([3, 1, 4, 2], dtype=np.int64)
-        self.blank = self.num_classes - 1
-        self.norm_by_times = False
-        self.norm_by_batchsize = False
-        self.norm_by_total_logits_len = False
-
-    def setUp(self):
-        self.op_type = "warpctc"
-        self.config()
-
-        logits = np.random.uniform(
-            0.1, 1.0,
-            [sum(self.logits_length), self.num_classes]).astype("float64")
-        softmax = np.apply_along_axis(stable_softmax, 1, logits)
-        # labels should not be blank
-        labels = np.random.randint(
-            0,
-            self.num_classes - 1, [sum(self.labels_length), 1],
-            dtype="int32")
-
-        ctc = CTCForward(softmax, self.logits_lod, labels, self.labels_lod,
-                         self.num_classes, self.batch_size, self.blank,
-                         self.norm_by_times)
-        loss = ctc.forward()
-
-        max_sequence_length = 0
-        for i in range(self.batch_size):
-            max_sequence_length = max(max_sequence_length,
-                                      self.logits_length[i])
-        # reshape logits to T*N*S
-        new_logits = np.zeros(
-            [max_sequence_length, self.batch_size, self.num_classes],
-            dtype=logits.dtype)
-
-        cur = 0
-        for batch_id in range(self.batch_size):
-            for i in range(self.logits_length[batch_id]):
-                for j in range(self.num_classes):
-                    new_logits[i, batch_id, j] = logits[cur + i, j]
-            cur = cur + self.logits_length[batch_id]
-
-        # reshape labels to N*S
-        max_target_seq_length = 0
-        for i in range(self.batch_size):
-            max_target_seq_length = max(max_target_seq_length,
-                                        self.labels_length[i])
-        new_labels = np.zeros(
-            [self.batch_size, max_target_seq_length], dtype="int32")
-
-        cur = 0
-        for batch_id in range(self.batch_size):
-            for i in range(self.labels_length[batch_id]):
-                new_labels[batch_id, i] = labels[cur + i]
-            cur = cur + self.labels_length[batch_id]
-
-        self.gradient = np.zeros(
-            [max_sequence_length, self.batch_size, self.num_classes],
-            dtype=logits.dtype)
-
-        self.inputs = {
-            "Logits": new_logits,
-            "Label": new_labels,
-            "LogitsLength": self.logits_length,
-            "LabelLength": self.labels_length
-        }
-        self.outputs = {"Loss": loss}
-        self.attrs = {
-            "blank": self.blank,
-            "norm_by_times": self.norm_by_times,
-            "norm_by_batchsize": self.norm_by_batchsize,
-            "norm_by_total_logits_len": self.norm_by_total_logits_len,
-        }
-
-    def test_check_output(self):
-        self.check_output()
-
-
-@skip_check_grad_ci(reason="For warpctc, not check grad.")
-class TestWarpCTCOpFp64NormByTimes(TestWarpCTCOpAttr):
-    def config(self):
-        self.batch_size = 4
-        self.num_classes = 8
-        self.logits_lod = [[4, 1, 5, 5]]
-        self.labels_lod = [[3, 1, 4, 2]]
-        self.logits_length = np.array([4, 1, 5, 5], dtype=np.int64)
-        self.labels_length = np.array([3, 1, 4, 2], dtype=np.int64)
-        self.blank = self.num_classes - 1
-        self.norm_by_times = True
-        self.norm_by_batchsize = False
-        self.norm_by_total_logits_len = False
-
-
-@skip_check_grad_ci(reason="For warpctc, not check grad.")
-class TestWarpCTCOpFp64SizeAverage(TestWarpCTCOpAttr):
-    def config(self):
-        self.batch_size = 4
-        self.num_classes = 8
-        self.logits_lod = [[4, 1, 5, 5]]
-        self.labels_lod = [[3, 1, 4, 2]]
-        self.logits_length = np.array([4, 1, 5, 5], dtype=np.int64)
-        self.labels_length = np.array([3, 1, 4, 2], dtype=np.int64)
-        self.blank = self.num_classes - 1
-        self.norm_by_times = False
-        self.norm_by_batchsize = True
-        self.norm_by_total_logits_len = False
-
-
-@skip_check_grad_ci(reason="For warpctc, not check grad.")
-class TestWarpCTCOpFp64LengthAverage(TestWarpCTCOpAttr):
-    def config(self):
-        self.batch_size = 4
-        self.num_classes = 8
-        self.logits_lod = [[4, 1, 5, 5]]
-        self.labels_lod = [[3, 1, 4, 2]]
-        self.logits_length = np.array([4, 1, 5, 5], dtype=np.int64)
-        self.labels_length = np.array([3, 1, 4, 2], dtype=np.int64)
-        self.blank = self.num_classes - 1
-        self.norm_by_times = False
-        self.norm_by_batchsize = False
-        self.norm_by_total_logits_len = True
-
-
-class TestWarpCTCOpDygraph(unittest.TestCase):
-    def test_dygraph(self):
-        places = ['cpu']
-        if paddle.is_compiled_with_cuda():
-            places += ['gpu:0']
-
-        for p in places:
-            paddle.set_device(p)
-            paddle.disable_static()
-            paddle.seed(1)
-            np.random.seed(1)
-            #(B=2)
-            log_probs = np.array(
-                [[[4.17021990e-01, 7.20324516e-01, 1.14374816e-04],
-                  [3.02332580e-01, 1.46755889e-01, 9.23385918e-02]], [
-                      [1.86260208e-01, 3.45560730e-01, 3.96767467e-01],
-                      [5.38816750e-01, 4.19194520e-01, 6.85219526e-01]
-                  ], [[2.04452246e-01, 8.78117442e-01, 2.73875929e-02],
-                      [6.70467496e-01, 4.17304814e-01, 5.58689833e-01]],
-                 [[1.40386939e-01, 1.98101491e-01, 8.00744593e-01],
-                  [9.68261600e-01, 3.13424170e-01, 6.92322612e-01]],
-                 [[8.76389146e-01, 8.94606650e-01, 8.50442126e-02],
-                  [3.90547849e-02, 1.69830427e-01,
-                   8.78142476e-01]]]).astype("float32")
-            labels = np.array([[1, 2, 2], [1, 2, 2]]).astype("int32")
-            input_lengths = np.array([5, 5]).astype("int64")
-            label_lengths = np.array([3, 3]).astype("int64")
-
-            log_probs = paddle.to_tensor(log_probs, stop_gradient=False)
-            labels = paddle.to_tensor(labels)
-            input_lengths = paddle.to_tensor(input_lengths)
-            label_lengths = paddle.to_tensor(label_lengths)
-
-            loss = paddle.nn.CTCLoss(
-                blank=0, reduction='sum')(log_probs,
-                                          labels,
-                                          input_lengths,
-                                          label_lengths,
-                                          norm_by_times=False,
-                                          norm_by_batchsize=False,
-                                          norm_by_total_logits_len=False)
-            self.assertTrue(np.allclose(loss, [6.82563686], atol=1))
-            loss.backward()
-            log_probs.clear_gradient()
-
-            loss = paddle.nn.CTCLoss(
-                blank=0, reduction='sum')(log_probs,
-                                          labels,
-                                          input_lengths,
-                                          label_lengths,
-                                          norm_by_times=True,
-                                          norm_by_batchsize=False,
-                                          norm_by_total_logits_len=False)
-            self.assertTrue(np.allclose(loss, [6.82563686], atol=1))
-            loss.backward()
-            log_probs.clear_gradient()
-
-            loss = paddle.nn.CTCLoss(
-                blank=0, reduction='sum')(log_probs,
-                                          labels,
-                                          input_lengths,
-                                          label_lengths,
-                                          norm_by_times=False,
-                                          norm_by_batchsize=True,
-                                          norm_by_total_logits_len=False)
-            self.assertTrue(np.allclose(loss, [6.82563686], atol=1))
-            loss.backward()
-            log_probs.clear_gradient()
-
-            loss = paddle.nn.CTCLoss(
-                blank=0, reduction='sum')(log_probs,
-                                          labels,
-                                          input_lengths,
-                                          label_lengths,
-                                          norm_by_times=False,
-                                          norm_by_batchsize=False,
-                                          norm_by_total_logits_len=True)
-            self.assertTrue(np.allclose(loss, [6.82563686], atol=1))
-            loss.backward()
-            log_probs.clear_gradient()
-
-            paddle.enable_static()
-
-
 class TestWarpCTCOpError(unittest.TestCase):
     def test_errors(self):
         with program_guard(Program(), Program()):
diff --git a/python/paddle/fluid/tests/unittests/white_list/no_check_set_white_list.py b/python/paddle/fluid/tests/unittests/white_list/no_check_set_white_list.py
index fd87e7584cea5..23bbc377cae27 100644
--- a/python/paddle/fluid/tests/unittests/white_list/no_check_set_white_list.py
+++ b/python/paddle/fluid/tests/unittests/white_list/no_check_set_white_list.py
@@ -33,5 +33,6 @@
     'softmax_with_cross_entropy',
     'svd',
     'eigh',
+    'eigvalsh',
     'class_center_sample',
 ]
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_activation_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_activation_op_xpu.py
index 9f807b06cb1a4..c2c69be45bf30 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_activation_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_activation_op_xpu.py
@@ -95,6 +95,26 @@ def test_check_grad(self):
             self.check_grad_with_place(place, ['X'], 'Out')
 
 
+class TestXPUTanhFP16(TestXPUActivation):
+    def setUp(self):
+        self.op_type = "tanh"
+        self.init_dtype()
+        x = np.random.uniform(0.1, 1, [11, 17]).astype(self.dtype)
+        out = np.tanh(x)
+
+        self.attrs = {'use_xpu': True}
+        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
+        self.outputs = {'Out': out}
+
+    def init_dtype(self):
+        self.dtype = np.float16
+
+    def test_check_grad(self):
+        if paddle.is_compiled_with_xpu():
+            place = paddle.XPUPlace(0)
+            self.check_grad_with_place(place, ['X'], 'Out')
+
+
 @unittest.skipIf(not paddle.is_compiled_with_xpu(),
                  "core is not compiled with XPU")
 class TestXPUSqrt(TestXPUActivation):
@@ -177,6 +197,27 @@ def test_check_grad(self):
             self.check_grad_with_place(place, ['X'], 'Out')
 
 
+class TestXPUGelu(TestXPUActivation):
+    def setUp(self):
+        self.op_type = "gelu"
+        self.init_dtype()
+        approximate = False
+        x = np.random.uniform(-1, 1, [11, 17]).astype(self.dtype)
+        out = gelu(x, approximate)
+
+        self.inputs = {'X': x}
+        self.outputs = {'Out': out}
+        self.attrs = {"approximate": approximate, 'use_xpu': True}
+
+    def init_dtype(self):
+        self.dtype = np.float16
+
+    def test_check_grad(self):
+        if paddle.is_compiled_with_xpu():
+            place = paddle.XPUPlace(0)
+            self.check_grad_with_place(place, ['X'], 'Out')
+
+
 def gelu(x, approximate):
     if approximate:
         y_ref = 0.5 * x * (1.0 + np.tanh(
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_elementwise_add_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_elementwise_add_op_xpu.py
index c4905a229b2e5..9ef8cc1e02790 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_elementwise_add_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_elementwise_add_op_xpu.py
@@ -28,17 +28,12 @@
 @unittest.skipIf(not paddle.is_compiled_with_xpu(),
                  "core is not compiled with XPU")
 class TestElementwiseAddOp(XPUOpTest):
-    def init_kernel_type(self):
-        self.use_mkldnn = False
-
     def setUp(self):
         self.op_type = "elementwise_add"
         self.init_dtype()
         self.init_input_output()
-        self.init_kernel_type()
         self.init_axis()
-        self.use_xpu = True
-
+        self.init_max_relative_error()
         self.inputs = {
             'X': OpTest.np_dtype_to_fluid_dtype(self.x),
             'Y': OpTest.np_dtype_to_fluid_dtype(self.y)
@@ -55,7 +50,9 @@ def test_check_grad_normal(self):
         if paddle.is_compiled_with_xpu():
             place = paddle.XPUPlace(0)
             self.check_grad_with_place(
-                place, ['X', 'Y'], 'Out', max_relative_error=0.006)
+                place, ['X', 'Y'],
+                'Out',
+                max_relative_error=self.max_relative_error)
 
     def test_check_grad_ingore_x(self):
         if paddle.is_compiled_with_xpu():
@@ -64,7 +61,7 @@ def test_check_grad_ingore_x(self):
                 place, ['Y'],
                 'Out',
                 no_grad_set=set("X"),
-                max_relative_error=0.006)
+                max_relative_error=self.max_relative_error)
 
     def test_check_grad_ingore_y(self):
         if paddle.is_compiled_with_xpu():
@@ -73,7 +70,7 @@ def test_check_grad_ingore_y(self):
                 place, ['X'],
                 'Out',
                 no_grad_set=set("Y"),
-                max_relative_error=0.006)
+                max_relative_error=self.max_relative_error)
 
     def init_input_output(self):
         self.x = np.random.uniform(0.1, 1, [13, 17]).astype(self.dtype)
@@ -86,6 +83,9 @@ def init_dtype(self):
     def init_axis(self):
         self.axis = -1
 
+    def init_max_relative_error(self):
+        self.max_relative_error = 0.006
+
 
 @unittest.skipIf(not paddle.is_compiled_with_xpu(),
                  "core is not compiled with XPU")
@@ -337,5 +337,170 @@ def test_dygraph(self):
             self.assertEqual((np_z == z_expected).all(), True)
 
 
+######## fp16 test
+class TestElementwiseAddFP16Op(TestElementwiseAddOp):
+    def init_dtype(self):
+        self.dtype = np.float16
+
+    def init_max_relative_error(self):
+        self.max_relative_error = 0.01
+
+
+class TestElementwiseAddOp_scalarFP16(TestElementwiseAddFP16Op):
+    def init_input_output(self):
+        self.x = np.random.rand(2, 3, 4).astype(self.dtype)
+        self.y = np.random.rand(1).astype(self.dtype)
+        self.out = self.x + self.y
+
+
+class TestElementwiseAddOp_scalar2FP16(TestElementwiseAddFP16Op):
+    def init_input_output(self):
+        self.x = np.random.rand(2, 3, 4).astype(self.dtype)
+        self.y = np.random.rand(1, 1).astype(self.dtype)
+        self.out = self.x + self.y
+
+
+class TestElementwiseAddOp_VectorFP16(TestElementwiseAddFP16Op):
+    def init_input_output(self):
+        self.x = np.random.random((100, )).astype(self.dtype)
+        self.y = np.random.random((100, )).astype(self.dtype)
+        self.out = np.add(self.x, self.y)
+
+
+class TestElementwiseAddOp_broadcast_0FP16(TestElementwiseAddFP16Op):
+    def init_input_output(self):
+        self.x = np.random.rand(100, 2, 3).astype(self.dtype)
+        self.y = np.random.rand(100).astype(self.dtype)
+        self.out = self.x + self.y.reshape(100, 1, 1)
+
+    def init_axis(self):
+        self.axis = 0
+
+
+class TestElementwiseAddOp_broadcast_1FP16(TestElementwiseAddFP16Op):
+    def init_input_output(self):
+        self.x = np.random.rand(2, 100, 3).astype(self.dtype)
+        self.y = np.random.rand(100).astype(self.dtype)
+        self.out = self.x + self.y.reshape(1, 100, 1)
+
+    def init_axis(self):
+        self.axis = 1
+
+
+class TestElementwiseAddOp_broadcast_2FP16(TestElementwiseAddFP16Op):
+    def init_input_output(self):
+        self.x = np.random.rand(2, 3, 100).astype(self.dtype)
+        self.y = np.random.rand(100).astype(self.dtype)
+        self.out = self.x + self.y.reshape(1, 1, 100)
+
+
+class TestElementwiseAddOp_broadcast_3FP16(TestElementwiseAddFP16Op):
+    def init_input_output(self):
+        self.x = np.random.rand(2, 10, 12, 3).astype(self.dtype)
+        self.y = np.random.rand(10, 12).astype(self.dtype)
+        self.out = self.x + self.y.reshape(1, 10, 12, 1)
+
+    def init_axis(self):
+        self.axis = 1
+
+
+class TestElementwiseAddOp_broadcast_4FP16(TestElementwiseAddFP16Op):
+    def init_input_output(self):
+        self.x = np.random.rand(100, 2, 3, 4).astype(self.dtype)
+        self.y = np.random.rand(100, 1).astype(self.dtype)
+        self.out = self.x + self.y.reshape(100, 1, 1, 1)
+
+    def init_axis(self):
+        self.axis = 0
+
+
+class TestElementwiseAddOp_broadcast_5FP16(TestElementwiseAddFP16Op):
+    def init_input_output(self):
+        self.x = np.random.rand(10, 3, 12).astype(self.dtype)
+        self.y = np.random.rand(10, 1, 12).astype(self.dtype)
+        self.out = self.x + self.y
+
+    def init_dtype(self):
+        self.dtype = np.float16
+
+
+class TestElementwiseAddOp_broadcast_6FP16(TestElementwiseAddFP16Op):
+    def init_input_output(self):
+        self.x = np.random.rand(2, 12, 3, 5).astype(self.dtype)
+        self.y = np.random.rand(2, 12, 1, 5).astype(self.dtype)
+        self.out = self.x + self.y
+
+
+class TestElementwiseAddOp_broadcast_7FP16(TestElementwiseAddFP16Op):
+    def init_input_output(self):
+        self.x = np.random.rand(1, 1, 20, 5).astype(self.dtype)
+        self.y = np.random.rand(20, 5, 1, 1).astype(self.dtype)
+        self.out = self.x + self.y
+
+    def init_dtype(self):
+        self.dtype = np.float16
+
+
+class TestElementwiseAddOp_rowwise_add_0FP16(TestElementwiseAddFP16Op):
+    def init_input_output(self):
+        self.x = np.random.rand(2, 10, 12).astype(self.dtype)
+        self.y = np.random.rand(10, 12).astype(self.dtype)
+        self.out = self.x + self.y.reshape(1, 10, 12)
+
+    def init_axis(self):
+        self.axis = 1
+
+
+class TestElementwiseAddOp_rowwise_add_1FP16(TestElementwiseAddFP16Op):
+    def init_input_output(self):
+        self.x = np.random.rand(100, 1).astype(self.dtype)
+        self.y = np.random.rand(1).astype(self.dtype)
+        self.out = self.x + self.y.reshape(1, 1)
+
+    def init_axis(self):
+        self.axis = 1
+
+
+class TestElementwiseAddOp_channelwise_addFP16(TestElementwiseAddFP16Op):
+    def init_input_output(self):
+        self.x = np.random.rand(100, 2, 3).astype(self.dtype)
+        self.y = np.random.rand(100, 1, 1).astype(self.dtype)
+        self.out = self.x + self.y
+
+    def init_axis(self):
+        self.axis = -1
+
+
+class TestElementwiseAddOp_commonuse_add1FP16(TestElementwiseAddFP16Op):
+    def init_input_output(self):
+        self.x = np.random.rand(2, 3, 100).astype(self.dtype)
+        self.y = np.random.rand(1, 1, 100).astype(self.dtype)
+        self.out = self.x + self.y
+
+    def init_axis(self):
+        self.axis = -1
+
+
+class TestElementwiseAddOp_commonuse_add2FP16(TestElementwiseAddFP16Op):
+    def init_input_output(self):
+        self.x = np.random.rand(10, 3, 1, 4).astype(self.dtype)
+        self.y = np.random.rand(10, 1, 12, 1).astype(self.dtype)
+        self.out = self.x + self.y
+
+    def init_axis(self):
+        self.axis = -1
+
+
+class TestElementwiseAddOp_xsize_lessthan_ysize_addFP16(
+        TestElementwiseAddFP16Op):
+    def init_input_output(self):
+        self.x = np.random.rand(10, 12).astype(self.dtype)
+        self.y = np.random.rand(2, 3, 10, 12).astype(self.dtype)
+        self.out = self.x + self.y
+
+    def init_axis(self):
+        self.axis = 2
+
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_gather_nd_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_gather_nd_op_xpu.py
new file mode 100644
index 0000000000000..0f9751cec4d92
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/xpu/test_gather_nd_op_xpu.py
@@ -0,0 +1,268 @@
+#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+import sys
+sys.path.append("..")
+from op_test import OpTest
+from op_test_xpu import XPUOpTest
+import paddle.fluid as fluid
+import paddle
+
+
+def gather_nd_grad(x, index):
+    dout_shape = index.shape[:-1] + x.shape[index.shape[-1]:]
+    numel = 1
+    for i in dout_shape:
+        numel = numel * i
+    dout = np.full(dout_shape, 1. / numel)
+    dx = np.full_like(x, 0)
+
+    index = tuple(index.reshape(-1, index.shape[-1]).T)
+    np.add.at(dx, index, dout)
+
+    return dx
+
+
+def test_class1(op_type, typename):
+    class TestGatherNdOpWithEmptyIndex(XPUOpTest):
+        #Index has empty element, which means copy entire tensor
+
+        def setUp(self):
+            self.set_xpu()
+            self.place = paddle.XPUPlace(0)
+            self.op_type = "gather_nd"
+            xnp = np.random.random((5, 20)).astype(typename)
+            self.inputs = {
+                'X': xnp,
+                'Index': np.array([[], []]).astype("int32")
+            }
+            self.outputs = {
+                'Out': np.vstack((xnp[np.newaxis, :], xnp[np.newaxis, :]))
+            }
+
+        def set_xpu(self):
+            self.__class__.use_xpu = True
+
+        def test_check_output(self):
+            self.check_output_with_place(self.place)
+
+        def test_check_grad(self):
+            pass
+
+    cls_name = "{0}_{1}_1".format(op_type, typename)
+    TestGatherNdOpWithEmptyIndex.__name__ = cls_name
+    globals()[cls_name] = TestGatherNdOpWithEmptyIndex
+
+
+def test_class2(op_type, typename):
+    class TestGatherNdOpWithIndex1(OpTest):
+        def setUp(self):
+            self.set_xpu()
+            self.place = paddle.XPUPlace(0)
+            self.op_type = "gather_nd"
+            xnp = np.random.random((5, 20)).astype(typename)
+            self.inputs = {'X': xnp, 'Index': np.array([1]).astype("int32")}
+            self.outputs = {'Out': self.inputs["X"][self.inputs["Index"]]}
+
+        def set_xpu(self):
+            self.__class__.use_xpu = True
+
+        def test_check_output(self):
+            self.check_output_with_place(self.place)
+
+        def test_check_grad(self):
+            pass
+
+    cls_name = "{0}_{1}_2".format(op_type, typename)
+    TestGatherNdOpWithIndex1.__name__ = cls_name
+    globals()[cls_name] = TestGatherNdOpWithIndex1
+
+
+def test_class3(op_type, typename):
+    class TestGatherNdOpWithLowIndex(OpTest):
+        #Index has low rank, X has high rank
+
+        def setUp(self):
+            self.set_xpu()
+            self.place = paddle.XPUPlace(0)
+            self.op_type = "gather_nd"
+            xnp = np.random.uniform(0, 100, (10, 10)).astype(typename)
+            index = np.array([[1], [2]]).astype("int64")
+
+            self.inputs = {'X': xnp, 'Index': index}
+            self.outputs = {'Out': xnp[tuple(index.T)]}
+            self.x_grad = gather_nd_grad(xnp, index)
+
+        def set_xpu(self):
+            self.__class__.use_xpu = True
+
+        def test_check_output(self):
+            self.check_output_with_place(self.place)
+
+        def test_check_grad(self):
+            pass
+
+    cls_name = "{0}_{1}_3".format(op_type, typename)
+    TestGatherNdOpWithLowIndex.__name__ = cls_name
+    globals()[cls_name] = TestGatherNdOpWithLowIndex
+
+
+def test_class4(op_type, typename):
+    class TestGatherNdOpIndex1(OpTest):
+        #Index has low rank, X has high rank
+
+        def setUp(self):
+            self.set_xpu()
+            self.place = paddle.XPUPlace(0)
+            self.op_type = "gather_nd"
+            xnp = np.random.uniform(0, 100, (10, 10)).astype(typename)
+            index = np.array([1, 2]).astype("int64")
+
+            self.inputs = {'X': xnp, 'Index': index}
+
+            self.outputs = {'Out': xnp[tuple(index.T)]}
+
+        def set_xpu(self):
+            self.__class__.use_xpu = True
+
+        def test_check_output(self):
+            self.check_output_with_place(self.place)
+
+        def test_check_grad(self):
+            pass
+
+    cls_name = "{0}_{1}_4".format(op_type, typename)
+    TestGatherNdOpIndex1.__name__ = cls_name
+    globals()[cls_name] = TestGatherNdOpIndex1
+
+
+def test_class5(op_type, typename):
+    class TestGatherNdOpWithSameIndexAsX(OpTest):
+        #Index has same rank as X's rank
+
+        def setUp(self):
+            self.set_xpu()
+            self.place = paddle.XPUPlace(0)
+            self.op_type = "gather_nd"
+            xnp = np.random.uniform(0, 100, (10, 10)).astype(typename)
+            index = np.array([[1, 1], [2, 1]]).astype("int64")
+
+            self.inputs = {'X': xnp, 'Index': index}
+            self.outputs = {'Out': xnp[tuple(index.T)]}  #[25, 22]
+
+        def set_xpu(self):
+            self.__class__.use_xpu = True
+
+        def test_check_output(self):
+            self.check_output_with_place(self.place)
+
+        def test_check_grad(self):
+            pass
+
+    cls_name = "{0}_{1}_5".format(op_type, typename)
+    TestGatherNdOpWithSameIndexAsX.__name__ = cls_name
+    globals()[cls_name] = TestGatherNdOpWithSameIndexAsX
+
+
+def test_class6(op_type, typename):
+    class TestGatherNdOpWithHighRankSame(OpTest):
+        #Both Index and X have high rank, and Rank(Index) = Rank(X)
+
+        def setUp(self):
+            self.set_xpu()
+            self.place = paddle.XPUPlace(0)
+            self.op_type = "gather_nd"
+            shape = (5, 2, 3, 1, 10)
+            xnp = np.random.rand(*shape).astype(typename)
+            index = np.vstack([np.random.randint(
+                0, s, size=2) for s in shape]).T
+
+            self.inputs = {'X': xnp, 'Index': index.astype("int32")}
+            self.outputs = {'Out': xnp[tuple(index.T)]}
+
+        def set_xpu(self):
+            self.__class__.use_xpu = True
+
+        def test_check_output(self):
+            self.check_output_with_place(self.place)
+
+        def test_check_grad(self):
+            pass
+
+    cls_name = "{0}_{1}_6".format(op_type, typename)
+    TestGatherNdOpWithHighRankSame.__name__ = cls_name
+    globals()[cls_name] = TestGatherNdOpWithHighRankSame
+
+
+def test_class7(op_type, typename):
+    class TestGatherNdOpWithHighRankDiff(OpTest):
+        #Both Index and X have high rank, Rank(Index) < Rank(X)
+
+        def setUp(self):
+            self.set_xpu()
+            self.place = paddle.XPUPlace(0)
+            self.op_type = "gather_nd"
+            shape = (2, 3, 4, 1, 10)
+            xnp = np.random.rand(*shape).astype(typename)
+            index = np.vstack(
+                [np.random.randint(
+                    0, s, size=200) for s in shape]).T
+            index_re = index.reshape([20, 5, 2, 5])
+
+            self.inputs = {'X': xnp, 'Index': index_re.astype("int32")}
+            self.outputs = {'Out': xnp[tuple(index.T)].reshape([20, 5, 2])}
+
+        def set_xpu(self):
+            self.__class__.use_xpu = True
+
+        def test_check_output(self):
+            self.check_output_with_place(self.place)
+
+        def test_check_grad(self):
+            pass
+
+    cls_name = "{0}_{1}_7".format(op_type, typename)
+    TestGatherNdOpWithHighRankDiff.__name__ = cls_name
+    globals()[cls_name] = TestGatherNdOpWithHighRankDiff
+
+
+class TestGatherNdAPI(unittest.TestCase):
+    def test_imperative(self):
+        paddle.disable_static()
+        input_1 = np.array([[1, 2], [3, 4], [5, 6]])
+        index_1 = np.array([[1]])
+        input = fluid.dygraph.to_variable(input_1)
+        index = fluid.dygraph.to_variable(index_1)
+        output = paddle.fluid.layers.gather(input, index)
+        output_np = output.numpy()
+        expected_output = np.array([3, 4])
+        self.assertTrue(np.allclose(output_np, expected_output))
+        paddle.enable_static()
+
+
+for _typename in {'float32', 'int', 'int64'}:
+    test_class1('gather_nd', _typename)
+    test_class2('gather_nd', _typename)
+    test_class3('gather_nd', _typename)
+    test_class4('gather_nd', _typename)
+    test_class5('gather_nd', _typename)
+    test_class6('gather_nd', _typename)
+    test_class7('gather_nd', _typename)
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_gather_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_gather_op_xpu.py
index d33cb2157b03b..bdf74018abb58 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_gather_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_gather_op_xpu.py
@@ -36,7 +36,6 @@ def gather_numpy(x, index, axis):
 
 class TestXPUGatherOp(XPUOpTest):
     def setUp(self):
-        self.dtype = "float32"
         self.op_type = "gather"
         self.use_xpu = True
         self.use_mkldnn = False
@@ -50,6 +49,16 @@ def setUp(self):
         }
         self.outputs = {'Out': self.inputs["X"][self.inputs["Index"]]}
 
+    def config(self):
+        """
+        For multi-dimension input
+        """
+        self.dtype = np.float32
+        self.x_shape = (10, 20)
+        self.x_type = np.float32
+        self.index = [1, 3, 5]
+        self.index_type = np.int32
+
     def test_check_output(self):
         if paddle.is_compiled_with_xpu():
             place = paddle.XPUPlace(0)
@@ -60,25 +69,17 @@ def test_check_grad(self):
             place = paddle.XPUPlace(0)
             self.check_grad_with_place(place, ['X'], 'Out')
 
-    def config(self):
-        """
-        For multi-dimension input
-        """
-        self.x_shape = (10, 20)
-        self.x_type = "float32"
-        self.index = [1, 3, 5]
-        self.index_type = "int32"
-
 
 class TestCase1(TestXPUGatherOp):
     def config(self):
         """
         For one dimension input
         """
+        self.dtype = np.float32
         self.x_shape = (100)
-        self.x_type = "float32"
+        self.x_type = np.float32
         self.index = [1, 3, 5]
-        self.index_type = "int32"
+        self.index_type = np.int32
 
 
 class TestCase2(TestXPUGatherOp):
@@ -86,10 +87,11 @@ def config(self):
         """
         For int64_t index type
         """
+        self.dtype = np.float32
         self.x_shape = (100)
-        self.x_type = "float32"
+        self.x_type = np.float32
         self.index = [1, 3, 5]
-        self.index_type = "int32"
+        self.index_type = np.int64
 
 
 class TestCase3(TestXPUGatherOp):
@@ -97,46 +99,128 @@ def config(self):
         """
         For other input type
         """
+        self.dtype = np.float32
         self.x_shape = (10, 20)
-        self.x_type = "float32"
+        self.x_type = np.float32
         self.index = [1, 3, 5]
-        self.index_type = "int32"
+        self.index_type = np.int32
 
 
 class TestCase4(TestXPUGatherOp):
     def config(self):
+        self.dtype = np.float32
         self.x_shape = (10, 20)
         self.attrs = {'use_xpu': True, 'overwrite': False}
-        self.x_type = "float32"
+        self.x_type = np.float32
         self.index = [1, 1]
-        self.index_type = "int32"
+        self.index_type = np.int32
 
 
 class TestCase5(TestXPUGatherOp):
     def config(self):
+        self.dtype = np.float32
         self.x_shape = (10, 20)
         self.attrs = {'use_xpu': True, 'overwrite': False}
-        self.x_type = "float32"
+        self.x_type = np.float32
         self.index = [1, 1, 3]
-        self.index_type = "int32"
+        self.index_type = np.int32
 
 
 class TestCase6(TestXPUGatherOp):
     def config(self):
+        self.dtype = np.float32
         self.x_shape = (10, 20)
         self.attrs = {'use_xpu': True, 'overwrite': True}
-        self.x_type = "float32"
+        self.x_type = np.float32
         self.index = [1, 3]
-        self.index_type = "int32"
+        self.index_type = np.int32
 
 
 class TestCase7(TestXPUGatherOp):
     def config(self):
+        self.dtype = np.float32
+        self.x_shape = (10, 20)
+        self.attrs = {'use_xpu': True, 'overwrite': True}
+        self.x_type = np.float32
+        self.index = [1, 3]
+        self.index_type = np.int64
+
+
+## test fp16
+class TestCaseFP161(TestXPUGatherOp):
+    def config(self):
+        """
+        For one dimension input
+        """
+        self.dtype = np.float16
+        self.x_shape = (100)
+        self.x_type = np.float16
+        self.index = [1, 3, 5]
+        self.index_type = np.int32
+
+
+class TestCaseFP162(TestXPUGatherOp):
+    def config(self):
+        """
+        For int64_t index type
+        """
+        self.dtype = np.float16
+        self.x_shape = (100)
+        self.x_type = np.float16
+        self.index = [1, 3, 5]
+        self.index_type = np.int64
+
+
+class TestCaseFP163(TestXPUGatherOp):
+    def config(self):
+        """
+        For other input type
+        """
+        self.dtype = np.float16
+        self.x_shape = (10, 20)
+        self.x_type = np.float16
+        self.index = [1, 3, 5]
+        self.index_type = np.int32
+
+
+class TestCaseFP164(TestXPUGatherOp):
+    def config(self):
+        self.dtype = np.float16
+        self.x_shape = (10, 20)
+        self.attrs = {'use_xpu': True, 'overwrite': False}
+        self.x_type = np.float16
+        self.index = [1, 1]
+        self.index_type = np.int32
+
+
+class TestCaseFP165(TestXPUGatherOp):
+    def config(self):
+        self.dtype = np.float16
+        self.x_shape = (10, 20)
+        self.attrs = {'use_xpu': True, 'overwrite': False}
+        self.x_type = np.float16
+        self.index = [1, 1, 3]
+        self.index_type = np.int32
+
+
+class TestCaseFP166(TestXPUGatherOp):
+    def config(self):
+        self.dtype = np.float16
+        self.x_shape = (10, 20)
+        self.attrs = {'use_xpu': True, 'overwrite': True}
+        self.x_type = np.float16
+        self.index = [1, 3]
+        self.index_type = np.int32
+
+
+class TestCaseFP167(TestXPUGatherOp):
+    def config(self):
+        self.dtype = np.float16
         self.x_shape = (10, 20)
         self.attrs = {'use_xpu': True, 'overwrite': True}
-        self.x_type = "float32"
+        self.x_type = np.float16
         self.index = [1, 3]
-        self.index_type = "int64"
+        self.index_type = np.int64
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_matmul_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_matmul_op_xpu.py
index f5d3ace202692..59646f2db413e 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_matmul_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_matmul_op_xpu.py
@@ -127,45 +127,23 @@ def setUp(self):
         self.outputs = {'Out': Out}
 
     def test_check_output(self):
-
-        if paddle.is_compiled_with_xpu() and len(self.inputs['X'].shape) == len(
-                self.inputs['Y'].shape) and self.inputs['X'].shape[
-                    0] == self.inputs['Y'].shape[0]:
-            place = paddle.XPUPlace(0)
-            self.check_output_with_place(place, atol=1e-3)
+        place = paddle.XPUPlace(0)
+        self.check_output_with_place(place, atol=1e-3)
 
     def test_check_grad_normal(self):
-
-        if paddle.is_compiled_with_xpu() and len(self.inputs['X'].shape) == len(
-                self.inputs['Y'].shape) and self.inputs['X'].shape[
-                    0] == self.inputs['Y'].shape[0]:
-            place = paddle.XPUPlace(0)
-            self.check_grad_with_place(
-                place, ['X', 'Y'], 'Out', max_relative_error=5e-2)
+        place = paddle.XPUPlace(0)
+        self.check_grad_with_place(
+            place, ['X', 'Y'], 'Out', max_relative_error=5e-2)
 
     def test_check_grad_ignore_x(self):
-
-        if paddle.is_compiled_with_xpu() and len(self.inputs['X'].shape) == len(
-                self.inputs['Y'].shape) and self.inputs['X'].shape[
-                    0] == self.inputs['Y'].shape[0]:
-            place = paddle.XPUPlace(0)
-            self.check_grad_with_place(
-                place, ['Y'],
-                'Out',
-                max_relative_error=5e-2,
-                no_grad_set=set("X"))
+        place = paddle.XPUPlace(0)
+        self.check_grad_with_place(
+            place, ['Y'], 'Out', max_relative_error=5e-2, no_grad_set=set("X"))
 
     def test_check_grad_ignore_y(self):
-
-        if paddle.is_compiled_with_xpu() and len(self.inputs['X'].shape) == len(
-                self.inputs['Y'].shape) and self.inputs['X'].shape[
-                    0] == self.inputs['Y'].shape[0]:
-            place = paddle.XPUPlace(0)
-            self.check_grad_with_place(
-                place, ['X'],
-                'Out',
-                max_relative_error=5e-2,
-                no_grad_set=set('Y'))
+        place = paddle.XPUPlace(0)
+        self.check_grad_with_place(
+            place, ['X'], 'Out', max_relative_error=5e-2, no_grad_set=set('Y'))
 
 
 class TestMatmulOpError(unittest.TestCase):
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_mean_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_mean_op_xpu.py
index bbdb0984ed68a..896821552c9f7 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_mean_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_mean_op_xpu.py
@@ -18,6 +18,7 @@
 import numpy as np
 import sys
 sys.path.append("..")
+from op_test_xpu import XPUOpTest
 from op_test import OpTest
 import paddle
 import paddle.fluid.core as core
@@ -27,22 +28,27 @@
 np.random.seed(10)
 
 
-class TestMeanOp(OpTest):
+class TestMeanOp(XPUOpTest):
     def setUp(self):
         self.op_type = "mean"
-        self.dtype = np.float64
         self.init_dtype_type()
         self.inputs = {'X': np.random.random((10, 10)).astype(self.dtype)}
-        self.outputs = {'Out': np.mean(self.inputs["X"])}
+        self.outputs = {'Out': np.mean(self.inputs["X"]).astype(np.float16)}
 
     def init_dtype_type(self):
-        pass
+        self.dtype = np.float32
 
     def test_check_output(self):
-        self.check_output()
+        if paddle.is_compiled_with_xpu():
+            paddle.enable_static()
+            place = paddle.XPUPlace(0)
+            self.check_output_with_place(place, atol=2e-3)
 
     def test_checkout_grad(self):
-        self.check_grad(['X'], 'Out')
+        if paddle.is_compiled_with_xpu():
+            paddle.enable_static()
+            place = paddle.XPUPlace(0)
+            self.check_grad_with_place(place, ['X'], 'Out')
 
 
 class TestMeanOpError(unittest.TestCase):
@@ -77,5 +83,23 @@ def test_checkout_grad(self):
             self.check_grad_with_place(place, ['X'], 'Out')
 
 
+class TestXPUMeanOpFp16(TestMeanOp):
+    def init_dtype_type(self):
+        self.dtype = np.float16
+
+    def test_check_output(self):
+        if paddle.is_compiled_with_xpu():
+            paddle.enable_static()
+            place = paddle.XPUPlace(0)
+            self.check_output_with_place(place)
+
+    def test_checkout_grad(self):
+        if paddle.is_compiled_with_xpu():
+            paddle.enable_static()
+            place = paddle.XPUPlace(0)
+            self.check_grad_with_place(
+                place, ['X'], 'Out', max_relative_error=1.e1)
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_scale_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_scale_op_xpu.py
index 1f74fa5e2d685..761e5c2243c65 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_scale_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_scale_op_xpu.py
@@ -18,27 +18,27 @@
 import numpy as np
 import sys
 sys.path.append("..")
-from op_test import OpTest
+from op_test_xpu import XPUOpTest
 import paddle.fluid as fluid
 import paddle.fluid.core as core
 from paddle.fluid.op import Operator
 import paddle
+from paddle.static import Program, program_guard
 
-paddle.enable_static()
 
-
-@unittest.skipIf(not paddle.is_compiled_with_xpu(),
-                 "core is not compiled with XPU")
-class TestXPUScaleOp(OpTest):
+class TestXPUScaleOp(XPUOpTest):
     def setUp(self):
         self.op_type = "scale"
-        self.dtype = np.float32
+        self.init_type()
         self.inputs = {'X': np.random.random((10, 10)).astype(self.dtype)}
         self.attrs = {'scale': -2.3, 'use_xpu': True}
         self.outputs = {
             'Out': self.inputs['X'] * self.dtype(self.attrs['scale'])
         }
 
+    def init_type(self):
+        self.dtype = np.float32
+
     def test_check_output(self):
         if paddle.is_compiled_with_xpu():
             place = paddle.XPUPlace(0)
@@ -50,5 +50,63 @@ def test_check_grad(self):
             self.check_grad_with_place(place, ['X'], 'Out')
 
 
+# class TestXPUScaleOpInt64(TestXPUScaleOp):
+#     def init_type(self):
+#         self.dtype = np.int64
+
+
+class TestScaleFp16Op(TestXPUScaleOp):
+    def init_dtype_type(self):
+        self.dtype = np.float16
+
+    def test_check_output(self):
+        place = core.XPUPlace(0)
+        self.check_output_with_place(place, atol=0.002)
+
+    def test_check_grad(self):
+        place = core.XPUPlace(0)
+        self.check_grad_with_place(place, ["X"], "Out", max_relative_error=0.05)
+
+
+class TestScaleApiStatic(unittest.TestCase):
+    def _executed_api(self, x, scale=1.0, bias=0.0):
+        return paddle.scale(x, scale, bias)
+
+    def test_api(self):
+        paddle.enable_static()
+        input = np.random.random([2, 25]).astype("float32")
+        main_prog = Program()
+        with program_guard(main_prog, Program()):
+            x = paddle.static.data(name="x", shape=[2, 25], dtype="float32")
+            out = self._executed_api(x, scale=2.0, bias=3.0)
+
+        exe = paddle.static.Executor(place=paddle.CPUPlace())
+        out = exe.run(main_prog, feed={"x": input}, fetch_list=[out])
+        self.assertEqual(np.array_equal(out[0], input * 2.0 + 3.0), True)
+
+
+class TestScaleInplaceApiStatic(TestScaleApiStatic):
+    def _executed_api(self, x, scale=1.0, bias=0.0):
+        return x.scale_(scale, bias)
+
+
+class TestScaleApiDygraph(unittest.TestCase):
+    def _executed_api(self, x, scale=1.0, bias=0.0):
+        return paddle.scale(x, scale, bias)
+
+    def test_api(self):
+        paddle.disable_static()
+        input = np.random.random([2, 25]).astype("float32")
+        x = paddle.to_tensor(input)
+        out = self._executed_api(x, scale=2.0, bias=3.0)
+        self.assertEqual(np.array_equal(out.numpy(), input * 2.0 + 3.0), True)
+        paddle.enable_static()
+
+
+class TestScaleInplaceApiDygraph(TestScaleApiDygraph):
+    def _executed_api(self, x, scale=1.0, bias=0.0):
+        return x.scale_(scale, bias)
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_softmax_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_softmax_op_xpu.py
index 92842fbc2e65a..f0f0e3d86dfac 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_softmax_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_softmax_op_xpu.py
@@ -17,8 +17,7 @@
 import sys
 import unittest
 sys.path.append("..")
-from op_test import OpTest
-
+from op_test_xpu import XPUOpTest
 paddle.enable_static()
 np.random.seed(10)
 
@@ -41,15 +40,13 @@ def ref_softmax(x, axis=None, dtype=None):
     return np.apply_along_axis(stable_softmax, axis, x_t)
 
 
-@unittest.skipIf(not paddle.is_compiled_with_xpu(),
-                 "core is not compiled with XPU")
-class TestXPUSoftmaxOp(OpTest):
+class TestXPUSoftmaxOp(XPUOpTest):
     def setUp(self):
         self.op_type = "softmax"
-        self.dtype = np.float32
         self.shape = [2, 3, 4, 5]
         self.axis = -1
         self.set_attrs()
+        self.init_type()
 
         x = np.random.uniform(-1, 1, self.shape).astype(self.dtype)
         out = np.apply_along_axis(stable_softmax, self.axis, x)
@@ -58,6 +55,9 @@ def setUp(self):
         self.outputs = {'Out': out}
         self.attrs = {'axis': self.axis, 'use_xpu': True}
 
+    def init_type(self):
+        self.dtype = np.float16
+
     def set_attrs(self):
         pass
 
@@ -68,26 +68,35 @@ def test_check_grad(self):
         self.check_grad_with_place(paddle.XPUPlace(0), ['X'], 'Out')
 
 
-@unittest.skipIf(not paddle.is_compiled_with_xpu(),
-                 "core is not compiled with XPU")
-class TestXPUSoftmaxAxis3(TestXPUSoftmaxOp):
-    def set_attrs(self):
-        self.axis = 3
+# class TestXPUSoftmaxAxis3(TestXPUSoftmaxOp):
+#     def set_attrs(self):
+#         self.axis = 3
 
+# class TestXPUSoftmax2D(TestXPUSoftmaxOp):
+#     def set_attrs(self):
+#         self.shape = [10, 12]
 
-@unittest.skipIf(not paddle.is_compiled_with_xpu(),
-                 "core is not compiled with XPU")
-class TestXPUSoftmax2D(TestXPUSoftmaxOp):
-    def set_attrs(self):
-        self.shape = [10, 12]
+# class TestXPUSoftmax3D(TestXPUSoftmaxOp):
+#     def set_attrs(self):
+#         self.shape = [4, 5, 6]
 
+# class TestXPUSoftmaxAxis3FP16(TestXPUSoftmaxOp):
+#     def set_attrs(self):
+#         self.axis = 3
+#     def init_type(self):
+#         self.dtype = np.float16
 
-@unittest.skipIf(not paddle.is_compiled_with_xpu(),
-                 "core is not compiled with XPU")
-class TestXPUSoftmax3D(TestXPUSoftmaxOp):
-    def set_attrs(self):
-        self.shape = [4, 5, 6]
+# class TestXPUSoftmax2DFP16(TestXPUSoftmaxOp):
+#     def set_attrs(self):
+#         self.shape = [10, 12]
+#     def init_type(self):
+#         self.dtype = np.float16
 
+# class TestXPUSoftmax3DFP16(TestXPUSoftmaxOp):
+#     def set_attrs(self):
+#         self.shape = [4, 5, 6]
+#     def init_type(self):
+#         self.dtype = np.float16
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_sum_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_sum_op_xpu.py
index 3bafbf649e6ce..8ae588975a56a 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_sum_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_sum_op_xpu.py
@@ -13,27 +13,26 @@
 # limitations under the License.
 
 from __future__ import print_function
-
-import unittest
-import numpy as np
 import sys
 sys.path.append("..")
-from op_test import OpTest
+import unittest
+import numpy as np
+from op_test_xpu import XPUOpTest
 import paddle
+from paddle import enable_static
 import paddle.fluid as fluid
 import paddle.fluid.core as core
 from paddle.fluid.op import Operator
-import paddle
+from paddle.fluid.tests.unittests.op_test import (
+    OpTest, convert_float_to_uint16, convert_uint16_to_float)
 
 paddle.enable_static()
 
 
-@unittest.skipIf(not paddle.is_compiled_with_xpu(),
-                 "core is not compiled with XPU")
-class TestXPUSumOp(OpTest):
+class TestSumOp(XPUOpTest):
     def setUp(self):
         self.op_type = "sum"
-        self.use_mkldnn = False
+        self.init_kernel_type()
         self.init_kernel_type()
         x0 = np.random.random((3, 40)).astype(self.dtype)
         x1 = np.random.random((3, 40)).astype(self.dtype)
@@ -41,21 +40,147 @@ def setUp(self):
         self.inputs = {"X": [("x0", x0), ("x1", x1), ("x2", x2)]}
         y = x0 + x1 + x2
         self.outputs = {'Out': y}
-        self.attrs = {'use_mkldnn': self.use_mkldnn, 'use_xpu': True}
 
     def init_kernel_type(self):
         self.dtype = np.float32
 
     def test_check_output(self):
-        if paddle.is_compiled_with_xpu():
-            place = paddle.XPUPlace(0)
-            self.check_output_with_place(place)
+        self.check_output()
 
     def test_check_grad(self):
-        if paddle.is_compiled_with_xpu():
-            place = paddle.XPUPlace(0)
-            self.check_grad_with_place(place, ['x0'], 'Out')
+        self.check_grad(['x0'], 'Out')
+
+
+#----------- test fp16 -----------
+class TestFP16SumOp(TestSumOp):
+    def init_kernel_type(self):
+        self.dtype = np.float16
+
+    def test_check_output(self):
+        place = core.XPUPlace(0)
+        # if core.is_float16_supported(place):
+        self.check_output_with_place(place, atol=2e-2)
+
+    # FIXME: Because of the precision fp16, max_relative_error
+    # should be 0.15 here.
+    def test_check_grad(self):
+        place = core.XPUPlace(0)
+        # if core.is_float16_supported(place):
+        self.check_grad_with_place(
+            place, ['x0'], 'Out', max_relative_error=0.15)
+
+
+def create_test_sum_fp16_class(parent):
+    class TestSumFp16Case(parent):
+        def init_kernel_type(self):
+            self.dtype = np.float16
+
+        def test_w_is_selected_rows(self):
+            place = core.XPUPlace(0)
+            # if core.is_float16_supported(place):
+            for inplace in [True, False]:
+                self.check_with_place(place, inplace)
+
+    cls_name = "{0}_{1}".format(parent.__name__, "SumFp16Test")
+    TestSumFp16Case.__name__ = cls_name
+    globals()[cls_name] = TestSumFp16Case
+
+
+class API_Test_Add_n(unittest.TestCase):
+    def test_api(self):
+        with fluid.program_guard(fluid.Program(), fluid.Program()):
+            input0 = fluid.layers.fill_constant(
+                shape=[2, 3], dtype='int64', value=5)
+            input1 = fluid.layers.fill_constant(
+                shape=[2, 3], dtype='int64', value=3)
+            expected_result = np.empty((2, 3))
+            expected_result.fill(8)
+            sum_value = paddle.add_n([input0, input1])
+            exe = fluid.Executor(fluid.XPUPlace(0))
+            result = exe.run(fetch_list=[sum_value])
+
+            self.assertEqual((result == expected_result).all(), True)
+
+        with fluid.dygraph.guard():
+            input0 = paddle.ones(shape=[2, 3], dtype='float32')
+            expected_result = np.empty((2, 3))
+            expected_result.fill(2)
+            sum_value = paddle.add_n([input0, input0])
+
+            self.assertEqual((sum_value.numpy() == expected_result).all(), True)
+
+
+class TestRaiseSumError(unittest.TestCase):
+    def test_errors(self):
+        def test_type():
+            fluid.layers.sum([11, 22])
+
+        self.assertRaises(TypeError, test_type)
+
+        def test_dtype():
+            data1 = fluid.data(name="input1", shape=[10], dtype="int8")
+            data2 = fluid.data(name="input2", shape=[10], dtype="int8")
+            fluid.layers.sum([data1, data2])
+
+        self.assertRaises(TypeError, test_dtype)
+
+        def test_dtype1():
+            data1 = fluid.data(name="input1", shape=[10], dtype="int8")
+            fluid.layers.sum(data1)
+
+        self.assertRaises(TypeError, test_dtype1)
+
+
+class TestRaiseSumsError(unittest.TestCase):
+    def test_errors(self):
+        def test_type():
+            fluid.layers.sums([11, 22])
+
+        self.assertRaises(TypeError, test_type)
+
+        def test_dtype():
+            data1 = fluid.data(name="input1", shape=[10], dtype="int8")
+            data2 = fluid.data(name="input2", shape=[10], dtype="int8")
+            fluid.layers.sums([data1, data2])
+
+        self.assertRaises(TypeError, test_dtype)
+
+        def test_dtype1():
+            data1 = fluid.data(name="input1", shape=[10], dtype="int8")
+            fluid.layers.sums(data1)
+
+        self.assertRaises(TypeError, test_dtype1)
+
+        def test_out_type():
+            data1 = fluid.data(name="input1", shape=[10], dtype="flaot32")
+            data2 = fluid.data(name="input2", shape=[10], dtype="float32")
+            fluid.layers.sums([data1, data2], out=[10])
+
+        self.assertRaises(TypeError, test_out_type)
+
+        def test_out_dtype():
+            data1 = fluid.data(name="input1", shape=[10], dtype="flaot32")
+            data2 = fluid.data(name="input2", shape=[10], dtype="float32")
+            out = fluid.data(name="out", shape=[10], dtype="int8")
+            fluid.layers.sums([data1, data2], out=out)
+
+        self.assertRaises(TypeError, test_out_dtype)
+
+
+class TestSumOpError(unittest.TestCase):
+    def test_errors(self):
+        def test_empty_list_input():
+            with fluid.dygraph.guard():
+                fluid.core.ops.sum([])
+
+        def test_list_of_none_input():
+            with fluid.dygraph.guard():
+                fluid.core.ops.sum([None])
+
+        self.assertRaises(Exception, test_empty_list_input)
+        self.assertRaises(Exception, test_list_of_none_input)
 
 
 if __name__ == "__main__":
+    enable_static()
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_tile_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_tile_op_xpu.py
new file mode 100644
index 0000000000000..d010e1633578e
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/xpu/test_tile_op_xpu.py
@@ -0,0 +1,267 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+import sys
+sys.path.append("..")
+from op_test import OpTest
+from op_test_xpu import XPUOpTest
+import paddle
+import paddle.fluid as fluid
+from paddle.fluid import compiler, Program, program_guard
+from paddle.fluid import core
+
+paddle.enable_static()
+np.random.seed(10)
+
+
+#Situation 1: repeat_times is a list (without tensor)
+class TestTileOpRank1(XPUOpTest):
+    def setUp(self):
+        self.set_xpu()
+        self.place = paddle.XPUPlace(0)
+        self.op_type = "tile"
+        self.init_data()
+
+        self.inputs = {'X': np.random.random(self.ori_shape).astype("float32")}
+        self.attrs = {'repeat_times': self.repeat_times}
+        output = np.tile(self.inputs['X'], self.repeat_times)
+        self.outputs = {'Out': output}
+
+    def set_xpu(self):
+        self.__class__.use_xpu = True
+
+    def init_data(self):
+        self.ori_shape = [100]
+        self.repeat_times = [2]
+
+    def test_check_output(self):
+        self.check_output_with_place(self.place)
+
+    def test_check_grad(self):
+        pass
+
+
+#with dimension expanding
+class TestTileOpRank2Expanding(TestTileOpRank1):
+    def init_data(self):
+        self.ori_shape = [120]
+        self.repeat_times = [2, 2]
+
+
+class TestTileOpRank2(TestTileOpRank1):
+    def init_data(self):
+        self.ori_shape = [12, 14]
+        self.repeat_times = [2, 3]
+
+
+class TestTileOpRank3_Corner(TestTileOpRank1):
+    def init_data(self):
+        self.ori_shape = (2, 10, 5)
+        self.repeat_times = (1, 1, 1)
+
+
+class TestTileOpRank3_Corner2(TestTileOpRank1):
+    def init_data(self):
+        self.ori_shape = (2, 10, 5)
+        self.repeat_times = (2, 2)
+
+
+class TestTileOpRank3(TestTileOpRank1):
+    def init_data(self):
+        self.ori_shape = (2, 4, 15)
+        self.repeat_times = (2, 1, 4)
+
+
+class TestTileOpRank4(TestTileOpRank1):
+    def init_data(self):
+        self.ori_shape = (2, 4, 5, 7)
+        self.repeat_times = (3, 2, 1, 2)
+
+
+# Situation 2: repeat_times is a list (with tensor)
+class TestTileOpRank1_tensor_attr(XPUOpTest):
+    def setUp(self):
+        self.set_xpu()
+        self.place = paddle.XPUPlace(0)
+        self.op_type = "tile"
+        self.init_data()
+        repeat_times_tensor = []
+        for index, ele in enumerate(self.repeat_times):
+            repeat_times_tensor.append(("x" + str(index), np.ones(
+                (1)).astype('int32') * ele))
+
+        self.inputs = {
+            'X': np.random.random(self.ori_shape).astype("float32"),
+            'repeat_times_tensor': repeat_times_tensor,
+        }
+        self.attrs = {"repeat_times": self.infer_repeat_times}
+        output = np.tile(self.inputs['X'], self.repeat_times)
+        self.outputs = {'Out': output}
+
+    def set_xpu(self):
+        self.__class__.use_xpu = True
+
+    def init_data(self):
+        self.ori_shape = [100]
+        self.repeat_times = [2]
+        self.infer_repeat_times = [-1]
+
+    def test_check_output(self):
+        self.check_output_with_place(self.place)
+
+    def test_check_grad(self):
+        pass
+
+
+class TestTileOpRank2_Corner_tensor_attr(TestTileOpRank1_tensor_attr):
+    def init_data(self):
+        self.ori_shape = [12, 14]
+        self.repeat_times = [1, 1]
+        self.infer_repeat_times = [1, -1]
+
+
+class TestTileOpRank2_attr_tensor(TestTileOpRank1_tensor_attr):
+    def init_data(self):
+        self.ori_shape = [12, 14]
+        self.repeat_times = [2, 3]
+        self.infer_repeat_times = [-1, 3]
+
+
+# Situation 3: repeat_times is a tensor
+class TestTileOpRank1_tensor(XPUOpTest):
+    def setUp(self):
+        self.set_xpu()
+        self.place = paddle.XPUPlace(0)
+        self.op_type = "tile"
+        self.init_data()
+
+        self.inputs = {
+            'X': np.random.random(self.ori_shape).astype("float32"),
+            'RepeatTimes': np.array(self.repeat_times).astype("int32"),
+        }
+        self.attrs = {}
+        output = np.tile(self.inputs['X'], self.repeat_times)
+        self.outputs = {'Out': output}
+
+    def set_xpu(self):
+        self.__class__.use_xpu = True
+
+    def init_data(self):
+        self.ori_shape = [100]
+        self.repeat_times = [2]
+
+    def test_check_output(self):
+        self.check_output_with_place(self.place)
+
+    def test_check_grad(self):
+        pass
+
+
+class TestTileOpRank2_tensor(TestTileOpRank1_tensor):
+    def init_data(self):
+        self.ori_shape = [12, 14]
+        self.repeat_times = [2, 3]
+
+
+# Situation 4: input x is Integer
+class TestTileOpInteger(XPUOpTest):
+    def setUp(self):
+        self.set_xpu()
+        self.place = paddle.XPUPlace(0)
+        self.op_type = "tile"
+        self.inputs = {
+            'X': np.random.randint(
+                10, size=(4, 4, 5)).astype("int32")
+        }
+        self.attrs = {'repeat_times': [2, 1, 4]}
+        output = np.tile(self.inputs['X'], (2, 1, 4))
+        self.outputs = {'Out': output}
+
+    def set_xpu(self):
+        self.__class__.use_xpu = True
+
+    def test_check_output(self):
+        self.check_output_with_place(self.place)
+
+
+# Situation 5: input x is Integer
+class TestTileOpInt64_t(XPUOpTest):
+    def setUp(self):
+        self.set_xpu()
+        self.place = paddle.XPUPlace(0)
+        self.op_type = "tile"
+        self.inputs = {
+            'X': np.random.randint(
+                10, size=(2, 4, 5)).astype("int64")
+        }
+        self.attrs = {'repeat_times': [2, 1, 4]}
+        output = np.tile(self.inputs['X'], (2, 1, 4))
+        self.outputs = {'Out': output}
+
+    def set_xpu(self):
+        self.__class__.use_xpu = True
+
+    def test_check_output(self):
+        self.check_output_with_place(self.place)
+
+
+# Situation 6: input x is Bool
+class TestTileOpBool(XPUOpTest):
+    def setUp(self):
+        self.set_xpu()
+        self.place = paddle.XPUPlace(0)
+        self.op_type = "tile"
+        self.inputs = {
+            'X': np.random.randint(
+                10, size=(2, 4, 5)).astype("bool")
+        }
+        self.attrs = {'repeat_times': [2, 1, 4]}
+        output = np.tile(self.inputs['X'], (2, 1, 4))
+        self.outputs = {'Out': output}
+
+    def set_xpu(self):
+        self.__class__.use_xpu = True
+
+    def test_check_output(self):
+        self.check_output_with_place(self.place)
+
+
+# Test python API
+class TestTileAPI(unittest.TestCase):
+    def test_api(self):
+        with fluid.dygraph.guard(paddle.XPUPlace(0)):
+            np_x = np.random.random([12, 14]).astype("float32")
+            x = paddle.to_tensor(np_x)
+
+            positive_2 = np.array([2]).astype("int32")
+            positive_2 = paddle.to_tensor(positive_2)
+
+            repeat_times = np.array([2, 3]).astype("int32")
+            repeat_times = paddle.to_tensor(repeat_times)
+
+            out_1 = paddle.tile(x, repeat_times=[2, 3])
+            out_2 = paddle.tile(x, repeat_times=[positive_2, 3])
+            out_3 = paddle.tile(x, repeat_times=repeat_times)
+
+            assert np.array_equal(out_1.numpy(), np.tile(np_x, (2, 3)))
+            assert np.array_equal(out_2.numpy(), np.tile(np_x, (2, 3)))
+            assert np.array_equal(out_3.numpy(), np.tile(np_x, (2, 3)))
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/hapi/dynamic_flops.py b/python/paddle/hapi/dynamic_flops.py
index 2c59ee67d4a8e..5c48501c07d7e 100644
--- a/python/paddle/hapi/dynamic_flops.py
+++ b/python/paddle/hapi/dynamic_flops.py
@@ -17,6 +17,7 @@
 import paddle.nn as nn
 import numpy as np
 from .static_flops import static_flops, Table
+from paddle.fluid.dygraph.dygraph_to_static.program_translator import unwrap_decorators
 
 __all__ = []
 
@@ -100,6 +101,10 @@ def count_leaky_relu(m, x, y):
             #Total Flops: 347560     Total Params: 61610
     """
     if isinstance(net, nn.Layer):
+        # If net is a dy2stat model, net.forward is StaticFunction instance,
+        # we set net.forward to original forward function.
+        _, net.forward = unwrap_decorators(net.forward)
+
         inputs = paddle.randn(input_size)
         return dynamic_flops(
             net,
diff --git a/python/paddle/incubate/__init__.py b/python/paddle/incubate/__init__.py
index 644b934814020..f44e38347e538 100644
--- a/python/paddle/incubate/__init__.py
+++ b/python/paddle/incubate/__init__.py
@@ -23,6 +23,8 @@
 from .tensor import segment_max
 from .tensor import segment_min
 
+from . import nn  #noqa: F401
+
 __all__ = [
     'LookAhead',
     'ModelAverage',
diff --git a/python/paddle/incubate/nn/__init__.py b/python/paddle/incubate/nn/__init__.py
index aada78e4ec6a4..f359ec1e0d842 100644
--- a/python/paddle/incubate/nn/__init__.py
+++ b/python/paddle/incubate/nn/__init__.py
@@ -12,8 +12,13 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from .layer.fused_transformer import FusedMultiHeadAttention  # noqa: F401 
+from .layer.fused_transformer import FusedMultiHeadAttention  # noqa: F401
+from .layer.fused_transformer import FusedFeedForward  # noqa: F401
+from .layer.fused_transformer import FusedTransformerEncoderLayer  # noqa: F401
 
 __all__ = [  #noqa
     'FusedMultiHeadAttention',
+    'FusedFeedForward',
+    'FusedTransformerEncoderLayer',
+
 ]
diff --git a/python/paddle/incubate/nn/functional/fused_transformer.py b/python/paddle/incubate/nn/functional/fused_transformer.py
index 68109b4ae694a..6c447a73c5251 100644
--- a/python/paddle/incubate/nn/functional/fused_transformer.py
+++ b/python/paddle/incubate/nn/functional/fused_transformer.py
@@ -194,31 +194,34 @@ def fused_multi_head_attention(x,
     Multi-Head Attention performs multiple parallel attention to jointly attending
     to information from different representation subspaces. This API only
     support self_attention. The pseudo code is as follows:
-    if pre_layer_norm:
-    	out = layer_norm(x);
-        out = linear(out) + qkv)bias
-    else:
-	out = linear(x) + bias;
-    out = transpose(out, perm=[2, 0, 3, 1, 4]);
-    # extract q, k and v from out.
-    q = out[0:1,::]
-    k = out[1:2,::]
-    v = out[2:3,::]
-    out = q * k^t;
-    out = attn_mask + out;
-    out = softmax(out);
-    out = dropout(out);
-    out = out * v;
-    out = transpose(out, perm=[0, 2, 1, 3]);
-    out = out_linear(out);
-    out = layer_norm(x + dropout(linear_bias + out));
+
+    .. code-block:: python
+
+    	if pre_layer_norm:
+    	    out = layer_norm(x)
+            out = linear(out) + qkv) + bias
+    	else:
+	    out = linear(x) + bias
+    	out = transpose(out, perm=[2, 0, 3, 1, 4])
+    	# extract q, k and v from out.
+    	q = out[0:1,::]
+    	k = out[1:2,::]
+    	v = out[2:3,::]
+    	out = q * k^t
+    	out = attn_mask + out
+    	out = softmax(out)
+    	out = dropout(out)
+    	out = out * v
+    	out = transpose(out, perm=[0, 2, 1, 3])
+    	out = out_linear(out)
+    	out = layer_norm(x + dropout(linear_bias + out))
 
     Parameters:
         x (Tensor): The input tensor of fused_multi_head_attention. The shape is
             `[batch\_size, sequence\_len, embed\_dim]`.
         qkv_weight (Tensor): The qkv weight tensor. The shape is `[3, num_head, dim_head, dim_embed]`.
         linear_weight (Tensor): The linear weight tensor. The shape is `[embed_dim, embed_dim]`.
-        pre_layer_norm (bool, optional): whether it is pre_layer_norm (True) or post_layer_norm architecture 
+        pre_layer_norm (bool, optional): whether it is pre_layer_norm (True) or post_layer_norm architecture
 	    (False). Default False.
         pre_ln_scale (Tensor, optional): The weight tensor of pre layernorm. Default None.
         pre_ln_bias (Tensor, optional): The bias tensor of pre layernorm. Default None.
@@ -229,12 +232,12 @@ def fused_multi_head_attention(x,
         qkv_bias (Tensor, optional): The bias of qkv computation. The shape is `[3, num_head, dim_head]`.
             Default None.
         linear_bias (Tensor, optional): The bias of linear. The shape is `[embed_dim]`. Default None.
-        attn_mask (Tensor, optional):  A tensor used in multi-head attention to prevents attention to 
- 	    some unwanted positions, usually the paddings or the subsequent positions. It is a tensor 
-            with shape broadcasted to `[batch_size, n_head, sequence_length, sequence_length]`. When the 
-            data type is bool, the unwanted positions have `False` values and the others have `True` values. 
-            When the data type is int, the unwanted positions have 0 values and the others have 1 values. 
-            When the data type is float, the unwanted positions have `-INF` values and the others have 0 values. 
+        attn_mask (Tensor, optional):  A tensor used in multi-head attention to prevents attention to
+ 	    some unwanted positions, usually the paddings or the subsequent positions. It is a tensor
+            with shape broadcasted to `[batch_size, n_head, sequence_length, sequence_length]`. When the
+            data type is bool, the unwanted positions have `False` values and the others have `True` values.
+            When the data type is int, the unwanted positions have 0 values and the others have 1 values.
+            When the data type is float, the unwanted positions have `-INF` values and the others have 0 values.
             It can be None when nothing wanted or needed to be prevented attention to. Default None.
         dropout_rate (float, optional): The dropout probability used on attention
             weights to drop some attention targets for the dropout after attention.
@@ -245,6 +248,9 @@ def fused_multi_head_attention(x,
         ln_epsilon (float, optional): Small float value added to denominator of layer_norm
             to avoid dividing by zero. Default is 1e-5.
 
+    Returns:
+        Tensor: The output Tensor, the data type and shape is same as `x`.
+
     Examples:
 
         .. code-block:: python
diff --git a/python/paddle/incubate/nn/layer/fused_transformer.py b/python/paddle/incubate/nn/layer/fused_transformer.py
index 16588dcef3d27..a3d8a74844b19 100644
--- a/python/paddle/incubate/nn/layer/fused_transformer.py
+++ b/python/paddle/incubate/nn/layer/fused_transformer.py
@@ -11,14 +11,12 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
-import copy
 from paddle.nn import functional as F
 from paddle.incubate.nn import functional as incubate_f
 from paddle.nn import Layer
 from paddle.framework import ParamAttr
 import paddle
-from paddle.nn.layer.transformer import _convert_attention_mask
+from paddle.nn.layer.transformer import _convert_attention_mask, _convert_param_attr_to_list
 from paddle.nn.initializer import Constant
 
 import collections
@@ -26,37 +24,42 @@
 
 class FusedMultiHeadAttention(Layer):
     """
-   Attention mapps queries and a set of key-value pairs to outputs, and
+    Attention mapps queries and a set of key-value pairs to outputs, and
     Multi-Head Attention performs multiple parallel attention to jointly attending
     to information from different representation subspaces.
     Please refer to `Attention Is All You Need <https://arxiv.org/pdf/1706.03762.pdf>`_
     for more details.
+
     Parameters:
         embed_dim (int): The expected feature size in the input and output.
         num_heads (int): The number of heads in multi-head attention.
         dropout_rate (float, optional): The dropout probability used on attention
-            weights to drop some attention targets for the dropout after attention. 
+            weights to drop some attention targets for the dropout after attention.
             0 for no dropout. Default 0.5.
         attn_dropout_rate (float, optional): The dropout probability used on attention
-            weights to drop some attention targets for the dropout in attention. 
+            weights to drop some attention targets for the dropout in attention.
             0 for no dropout. Default 0.5.
         kdim (int, optional): The feature size in key. If None, assumed equal to
             `embed_dim`. Default None.
         vdim (int, optional): The feature size in value. If None, assumed equal to
             `embed_dim`. Default None.
-        normalize_before (bool, optional): Indicate  whether it is pre_layer_norm (True) 
-            or post_layer_norm architecture (False). Default False.
+        normalize_before (bool, optional): Indicate  whether it is pre_layer_norm 
+            (True) or post_layer_norm architecture (False). Default False.
         need_weights (bool, optional): Indicate whether to return the attention
             weights. Now, only False is supported. Default False.
         weight_attr(ParamAttr, optional):  To specify the weight parameter property.
             Default: None, which means the default weight parameter property is used.
-            See usage for details in :code:`ParamAttr` .
+            See usage for details in :code:`ParamAttr`.
         bias_attr (ParamAttr|bool, optional): To specify the bias parameter property.
             Default: None, which means the default bias parameter property is used.
             If it is set to False, this layer will not have trainable bias parameter.
-            See usage for details in :code:`ParamAttr` .
+            See usage for details in :code:`ParamAttr`.
+
     Examples:
+
         .. code-block:: python
+
+            # required: gpu
             import paddle
             # input: [batch_size, sequence_length, embed_dim]
             query = paddle.rand((2, 4, 128))
@@ -138,6 +141,7 @@ def forward(self, query, key=None, value=None, attn_mask=None, cache=None):
         """
         Applies multi-head attention to map queries and a set of key-value pairs
         to outputs.
+
         Parameters:
             query (Tensor): The queries for multi-head attention. It is a
                 tensor with shape `[batch_size, query_length, embed_dim]`. The
@@ -154,17 +158,18 @@ def forward(self, query, key=None, value=None, attn_mask=None, cache=None):
                 to prevents attention to some unwanted positions, usually the
                 paddings or the subsequent positions. It is a tensor with shape
                 broadcasted to `[batch_size, n_head, sequence_length, sequence_length]`.
-                When the data type is bool, the unwanted positions have `False` 
-                values and the others have `True` values. When the data type is 
-                int, the unwanted positions have 0 values and the others have 1 
-                values. When the data type is float, the unwanted positions have 
-                `-INF` values and the others have 0 values. It can be None when 
+                When the data type is bool, the unwanted positions have `False`
+                values and the others have `True` values. When the data type is
+                int, the unwanted positions have 0 values and the others have 1
+                values. When the data type is float, the unwanted positions have
+                `-INF` values and the others have 0 values. It can be None when
                 nothing wanted or needed to be prevented attention to. Default None.
             cache (MultiHeadAttention.Cache|MultiHeadAttention.StaticCache, optional):
                 Now, only None is supported. Default None.
+
         Returns:
             Tensor|tuple: It is a tensor that has the same shape and data type \
-                as `query`, representing attention output. 
+                as `query`, representing attention output.
         """
         if attn_mask is not None:
             # Support bool or int mask
@@ -192,26 +197,114 @@ def forward(self, query, key=None, value=None, attn_mask=None, cache=None):
 
 
 class FusedFeedForward(Layer):
+    """
+    Parameters:
+        d_model (int): The expected feature size in the input and output.
+        dim_feedforward (int): The hidden layer size.
+        dropout_rate (float, optional): The dropout probability used in pre-process
+            and post-precess. Default 0.1
+        activation (str, optional): The activation function. Default relu.
+        act_dropout_rate (float, optional): The dropout probability after activition.
+            If None, use the value of `dropout_rate`. Default None
+        normalize_before (bool, optional): Indicate whether to put layer normalization
+            into, preprocessing or postprocessing. Default False
+        weight_attr (ParamAttr, optional): The attribute for the learnable weight of this layer.
+            The default value is None and the weight will be initialized to zero. For detailed
+            information, please refer to paddle.ParamAttr.
+        bias_attr (ParamAttr|bool, optional): The attribute for the learnable bias of thi layer.
+            If it is set to False, no bias will be added to the output. If it is set to None or one
+            kind of ParamAttr, a bias parameter will be created according to ParamAttr. For detailed
+            information, please refer to paddle.ParamAttr. The default value is None and the bias
+            will be initialized to zero.
+
+    Examples:
+        .. code-block:: python
+
+            # required: gpu
+            import paddle
+            from paddle.incubate.nn import FusedFeedForward
+
+            fused_feedforward_layer = FusedFeedForward(8, 8)
+            x = paddle.rand((1, 8, 8))
+            out = fused_feedforward_layer(x)
+            print(out.numpy().shape)
+            # (1, 8, 8)
+    """
+
     def __init__(self,
                  d_model,
                  dim_feedforward,
-                 dropout=0.1,
+                 dropout_rate=0.1,
                  activation="relu",
-                 act_dropout=None,
+                 act_dropout_rate=None,
                  normalize_before=False,
                  weight_attr=None,
                  bias_attr=None):
 
         super(FusedFeedForward, self).__init__()
-        raise NotImplementedError()
+        assert d_model > 0, (
+            "Expected d_model to be greater than 0, but recieved {}".format(
+                d_model))
+        assert dim_feedforward > 0, (
+            "Expected dim_feedforward to be greater than 0, but recieved {}".
+            format(dim_feedforward))
+
+        self._dtype = self._helper.get_default_dtype()
+        self._d_model = d_model
+        self._dim_feedforward = dim_feedforward
+        self._dropout_rate = dropout_rate
+        self._act_dropout_rate = dropout_rate if act_dropout_rate is None else act_dropout_rate
+        self._act_method = activation
+        self._normalize_before = normalize_before
+
+        self._linear1_weight = self.create_parameter(
+            shape=[d_model, dim_feedforward],
+            attr=weight_attr,
+            dtype=self._dtype,
+            is_bias=False)
+        self._linear1_bias = self.create_parameter(
+            shape=[dim_feedforward],
+            attr=bias_attr,
+            dtype=self._dtype,
+            is_bias=True)
+
+        self._linear2_weight = self.create_parameter(
+            shape=[dim_feedforward, d_model],
+            attr=weight_attr,
+            dtype=self._dtype,
+            is_bias=False)
+
+        self._linear2_bias = self.create_parameter(
+            shape=[d_model], attr=bias_attr, dtype=self._dtype, is_bias=True)
+
+        self._ln1_scale = self.create_parameter(
+            shape=[d_model],
+            attr=None,
+            is_bias=False,
+            default_initializer=Constant(1.0))
+        self._ln1_bias = self.create_parameter(
+            shape=[d_model], attr=None, is_bias=True)
+
+        self._ln2_scale = self.create_parameter(
+            shape=[d_model],
+            attr=None,
+            is_bias=False,
+            default_initializer=Constant(1.0))
+        self._ln2_bias = self.create_parameter(
+            shape=[d_model], attr=None, is_bias=True)
 
     def forward(self, src, cache=None):
-        raise NotImplementedError()
+        out = incubate_f.fused_feedforward(
+            src, self._linear1_weight, self._linear2_weight, self._linear1_bias,
+            self._linear2_bias, self._ln1_scale, self._ln1_bias,
+            self._ln2_scale, self._ln2_bias, self._dropout_rate,
+            self._act_dropout_rate, self._act_method, self._normalize_before)
+        return out
 
 
 class FusedTransformerEncoderLayer(Layer):
     """
-    TransformerEncoderLayer is composed of two sub-layers which are self (multi-head)
+    FusedTransformerEncoderLayer is composed of two sub-layers which are self (multi-head)
     attention and feedforward network. Before and after each sub-layer, pre-process
     and post-precess would be applied on the input and output accordingly. If
     `normalize_before` is True, pre-process is layer normalization and post-precess
@@ -222,14 +315,14 @@ class FusedTransformerEncoderLayer(Layer):
         d_model (int): The expected feature size in the input and output.
         nhead (int): The number of heads in multi-head attention(MHA).
         dim_feedforward (int): The hidden layer size in the feedforward network(FFN).
-        dropout (float, optional): The dropout probability used in pre-process
+        dropout_rate (float, optional): The dropout probability used in pre-process
             and post-precess of MHA and FFN sub-layer. Default 0.1
         activation (str, optional): The activation function in the feedforward
             network. Default relu.
-        attn_dropout (float, optional): The dropout probability used
+        attn_dropout_rate (float, optional): The dropout probability used
             in MHA to drop some attention target. If None, use the value of
             `dropout`. Default None
-        act_dropout (float, optional): The dropout probability used after FFN
+        act_dropout_rate (float, optional): The dropout probability used after FFN
             activition.  If None, use the value of `dropout`. Default None
         normalize_before (bool, optional): Indicate whether to put layer normalization
             into preprocessing of MHA and FFN sub-layers. If True, pre-process is layer
@@ -241,7 +334,7 @@ class FusedTransformerEncoderLayer(Layer):
             MHA, and `weight_attr[1]` would be used as `weight_attr` for linear in FFN.
             Otherwise, MHA and FFN both use it as `weight_attr` to create parameters.
             Default: None, which means the default weight parameter property is used.
-            See usage for details in :code:`ParamAttr` . 
+            See usage for details in :code:`ParamAttr` .
         bias_attr (ParamAttr|list|tuple|bool, optional): To specify the bias parameter property.
             If it is a list/tuple, `bias_attr[0]` would be used as `bias_attr` for
             MHA, and `bias_attr[1]` would be used as `bias_attr` for linear in FFN.
@@ -249,21 +342,21 @@ class FusedTransformerEncoderLayer(Layer):
             The `False` value means the corresponding layer would not have trainable
             bias parameter. See usage for details in :code:`ParamAttr` . Default: None,
             which means the default bias parameter property is used.
-            
+
 
     Examples:
 
         .. code-block:: python
-	    
+
 	    # required: gpu
             import paddle
-            from paddle.nn import TransformerEncoderLayer
+            from paddle.incubate.nn import FusedTransformerEncoderLayer
 
             # encoder input: [batch_size, src_len, d_model]
             enc_input = paddle.rand((2, 4, 128))
             # self attention mask: [batch_size, n_head, src_len, src_len]
             attn_mask = paddle.rand((2, 2, 4, 4))
-            encoder_layer = TransformerEncoderLayer(128, 2, 512)
+            encoder_layer = FusedTransformerEncoderLayer(128, 2, 512)
             enc_output = encoder_layer(enc_input, attn_mask)  # [2, 4, 128]
     """
 
@@ -271,10 +364,10 @@ def __init__(self,
                  d_model,
                  nhead,
                  dim_feedforward,
-                 dropout=0.1,
+                 dropout_rate=0.1,
                  activation="relu",
-                 attn_dropout=None,
-                 act_dropout=None,
+                 attn_dropout_rate=None,
+                 act_dropout_rate=None,
                  normalize_before=False,
                  weight_attr=None,
                  bias_attr=None):
@@ -283,7 +376,35 @@ def __init__(self,
         self._config.pop("__class__", None)  # py3
 
         super(FusedTransformerEncoderLayer, self).__init__()
-        raise NotImplementedError()
+        assert d_model > 0, ("Expected d_model to be greater than 0, "
+                             "but recieved {}".format(d_model))
+        assert nhead > 0, ("Expected nhead to be greater than 0, "
+                           "but recieved {}".format(nhead))
+        assert dim_feedforward > 0, (
+            "Expected dim_feedforward to be greater than 0, "
+            "but recieved {}".format(dim_feedforward))
+        attn_dropout_rate = dropout_rate if attn_dropout_rate is None else attn_dropout_rate
+        act_dropout_rate = dropout_rate if act_dropout_rate is None else act_dropout_rate
+        self.normalize_before = normalize_before
+
+        weight_attrs = _convert_param_attr_to_list(weight_attr, 2)
+        bias_attrs = _convert_param_attr_to_list(bias_attr, 2)
+
+        self.fused_attn = FusedMultiHeadAttention(
+            d_model,
+            nhead,
+            dropout_rate=attn_dropout_rate,
+            weight_attr=weight_attrs[0],
+            bias_attr=bias_attrs[0])
+
+        self.ffn = FusedFeedForward(
+            d_model,
+            dim_feedforward,
+            dropout_rate=dropout_rate,
+            act_dropout_rate=act_dropout_rate,
+            normalize_before=self.normalize_before,
+            weight_attr=weight_attrs[1],
+            bias_attr=bias_attrs[1])
 
     def forward(self, src, src_mask=None, cache=None):
         """
@@ -296,11 +417,11 @@ def forward(self, src, src_mask=None, cache=None):
                 to prevents attention to some unwanted positions, usually the
                 paddings or the subsequent positions. It is a tensor with shape
                 broadcasted to `[batch_size, n_head, sequence_length, sequence_length]`.
-                When the data type is bool, the unwanted positions have `False` 
-                values and the others have `True` values. When the data type is 
-                int, the unwanted positions have 0 values and the others have 1 
-                values. When the data type is float, the unwanted positions have 
-                `-INF` values and the others have 0 values. It can be None when 
+                When the data type is bool, the unwanted positions have `False`
+                values and the others have `True` values. When the data type is
+                int, the unwanted positions have 0 values and the others have 1
+                values. When the data type is float, the unwanted positions have
+                `-INF` values and the others have 0 values. It can be None when
                 nothing wanted or needed to be prevented attention to. Default None.
             cache (Tensor, optional): It is an instance of `MultiHeadAttention.Cache`.
                 See `TransformerEncoderLayer.gen_cache` for more details. It is
@@ -315,7 +436,16 @@ def forward(self, src, src_mask=None, cache=None):
                 incremental length. See `MultiHeadAttention.gen_cache` and \
                 `MultiHeadAttention.forward` for more details.
         """
-        raise NotImplementedError()
+        src_mask = _convert_attention_mask(src_mask, src.dtype)
+        if cache is None:
+            attn_out = self.fused_attn(src, attn_mask=src_mask)
+        else:
+            attn_out, incremental_cache = self.fused_attn(
+                src, attn_mask=src_mask, cache=cache)
+
+        ffn_out = self.ffn(attn_out)
+
+        return ffn_out if cache is None else (ffn_out, incremental_cache)
 
 
 class FusedTransformer(Layer):
@@ -326,12 +456,12 @@ class FusedTransformer(Layer):
 
     Please refer to `Attention is all you need <http://papers.nips.cc/paper/7181-attention-is-all-you-need.pdf>`_ ,
     and see `TransformerEncoder` and `TransformerDecoder` for more details.
-    
+
     Users can configurate the model architecture with corresponding parameters.
     Note the usage of `normalize_before` representing where to apply layer
     normalization (in pre-process or post-precess of multi-head attention or FFN),
     and some transformer like models are different on this, such as
-    `BERT <https://arxiv.org/abs/1810.04805>`_ and `GPT2 <https://d4mucfpksywv.cloudfront.net/better-language-models/language-models.pdf>`_ . 
+    `BERT <https://arxiv.org/abs/1810.04805>`_ and `GPT2 <https://d4mucfpksywv.cloudfront.net/better-language-models/language-models.pdf>`_ .
     The default architecture here places layer normalization in post-process and
     applies another layer normalization on the output of last encoder/decoder layer.
 
@@ -357,30 +487,30 @@ class FusedTransformer(Layer):
             Otherwise, no pre-process and post-precess includes dropout, residual
             connection, layer normalization. Default False
         weight_attr(ParamAttr|list|tuple, optional): To specify the weight parameter property.
-            If it is a list/tuple, the length of `weight_attr` could be 1, 2 or 3. If it is 3, 
-            `weight_attr[0]` would be used as `weight_attr` for self attention, `weight_attr[1]` 
-            would be used as `weight_attr` for cross attention of `TransformerDecoder`, 
-            and `weight_attr[2]` would be used as `weight_attr` for linear in FFN. 
-            If it is 2, `weight_attr[0]` would be used as `weight_attr` both for self attention 
-            and cross attntion and `weight_attr[1]` would be used as `weight_attr` for 
-            linear in FFN. If it is 1, `weight_attr[0]` would be used as `weight_attr` 
-            for self attention, cross attention and linear in FFN. Otherwise, 
-            the three sub-layers all uses it as `weight_attr` to create parameters. 
-            Default: None, which means the default weight parameter property is used. 
+            If it is a list/tuple, the length of `weight_attr` could be 1, 2 or 3. If it is 3,
+            `weight_attr[0]` would be used as `weight_attr` for self attention, `weight_attr[1]`
+            would be used as `weight_attr` for cross attention of `TransformerDecoder`,
+            and `weight_attr[2]` would be used as `weight_attr` for linear in FFN.
+            If it is 2, `weight_attr[0]` would be used as `weight_attr` both for self attention
+            and cross attntion and `weight_attr[1]` would be used as `weight_attr` for
+            linear in FFN. If it is 1, `weight_attr[0]` would be used as `weight_attr`
+            for self attention, cross attention and linear in FFN. Otherwise,
+            the three sub-layers all uses it as `weight_attr` to create parameters.
+            Default: None, which means the default weight parameter property is used.
             See usage for details
-            in :code:`ParamAttr` . 
+            in :code:`ParamAttr` .
         bias_attr (ParamAttr|list|tuple|bool, optional): To specify the bias parameter property.
-            If it is a list/tuple, the length of `bias_attr` could be 1, 2 or 3. If it is 3, 
-            `bias_attr[0]` would be used as `bias_attr` for self attention, `bias_attr[1]` 
-            would be used as `bias_attr` for cross attention of `TransformerDecoder`, 
-            and `bias_attr[2]` would be used as `bias_attr` for linear in FFN. 
-            If it is 2, `bias_attr[0]` would be used as `bias_attr` both for self attention 
-            and cross attntion and `bias_attr[1]` would be used as `bias_attr` for 
-            linear in FFN. If it is 1, `bias_attr[0]` would be used as `bias_attr` 
-            for self attention, cross attention and linear in FFN. Otherwise, 
-            the three sub-layers all uses it as `bias_attr` to create parameters. 
-            The `False` value means the corresponding layer would not have trainable 
-            bias parameter. See usage for details in :code:`ParamAttr` . 
+            If it is a list/tuple, the length of `bias_attr` could be 1, 2 or 3. If it is 3,
+            `bias_attr[0]` would be used as `bias_attr` for self attention, `bias_attr[1]`
+            would be used as `bias_attr` for cross attention of `TransformerDecoder`,
+            and `bias_attr[2]` would be used as `bias_attr` for linear in FFN.
+            If it is 2, `bias_attr[0]` would be used as `bias_attr` both for self attention
+            and cross attntion and `bias_attr[1]` would be used as `bias_attr` for
+            linear in FFN. If it is 1, `bias_attr[0]` would be used as `bias_attr`
+            for self attention, cross attention and linear in FFN. Otherwise,
+            the three sub-layers all uses it as `bias_attr` to create parameters.
+            The `False` value means the corresponding layer would not have trainable
+            bias parameter. See usage for details in :code:`ParamAttr` .
             Default: None,which means the default bias parameter property is used.
         custom_encoder (Layer, optional): If custom encoder is provided, use it as the encoder.
             Default None
diff --git a/python/paddle/linalg.py b/python/paddle/linalg.py
index 06b512150cee8..119db0894f917 100644
--- a/python/paddle/linalg.py
+++ b/python/paddle/linalg.py
@@ -23,11 +23,13 @@
 from .tensor.linalg import multi_dot  # noqa: F401
 from .tensor.linalg import matrix_rank
 from .tensor.linalg import svd
+from .tensor.linalg import eigvalsh
 from .tensor.linalg import qr
 from .tensor.linalg import eigh  # noqa: F401
 from .tensor.linalg import det
 from .tensor.linalg import slogdet
 from .tensor.linalg import pinv
+from .tensor.linalg import triangular_solve
 
 __all__ = [
     'cholesky',  #noqa
@@ -44,6 +46,8 @@
     'det',
     'slogdet',
     'eigh',
+    'eigvalsh',
     'pinv',
-    'solve'
+    'solve',
+    'triangular_solve',
 ]
diff --git a/python/paddle/nn/functional/loss.py b/python/paddle/nn/functional/loss.py
index adf93b24d3926..87eb564f60ea6 100755
--- a/python/paddle/nn/functional/loss.py
+++ b/python/paddle/nn/functional/loss.py
@@ -1001,9 +1001,7 @@ def ctc_loss(log_probs,
              label_lengths,
              blank=0,
              reduction='mean',
-             norm_by_times=False,
-             norm_by_batchsize=False,
-             norm_by_total_logits_len=False):
+             norm_by_times=False):
     """
 
     An operator integrating the open source Warp-CTC library (https://github.com/baidu-research/warp-ctc)
@@ -1019,9 +1017,7 @@ def ctc_loss(log_probs,
         blank (int, optional): The blank label index of Connectionist Temporal Classification (CTC) loss, which is in the half-opened interval [0, num_classes + 1). The data type must be int32. Default is 0.
         reduction (string, optional): Indicate how to average the loss, the candicates are ``'none'`` | ``'mean'`` | ``'sum'``. If :attr:`reduction` is ``'mean'``, the output loss will be divided by the label_lengths, and then return the mean of quotient; If :attr:`reduction` is ``'sum'``, return the sum of loss; If :attr:`reduction` is ``'none'``, no reduction will be applied. Default is ``'mean'``.
         norm_by_times (bool, default False) – Whether to normalize the gradients by the number of time-step, which is also the sequence’s length. There is no need to normalize the gradients if reduction mode is 'mean'.
-        norm_by_batchsize (bool): normalize the loss by the batch size (default: `False`). If `True`, supersedes `norm_by_times` (default: `False`)
-        norm_by_total_logits_len (bool): normalize the loss by the total number of frames in the batch. If `True`, supersedes `norm_by_batchsize` and `norm_by_times` (default: `False`)
-            
+
     Returns:
         Tensor, The Connectionist Temporal Classification (CTC) loss between ``log_probs`` and  ``labels``. If attr:`reduction` is ``'none'``, the shape of loss is [batch_size], otherwise, the shape of loss is [1]. Data type is the same as ``log_probs``.
 
@@ -1029,7 +1025,6 @@ def ctc_loss(log_probs,
 
         .. code-block:: python
 
-            # required: skiptest
             # declarative mode
             import paddle.nn.functional as F
             import numpy as np
@@ -1086,10 +1081,9 @@ def ctc_loss(log_probs,
     """
 
     loss_out = fluid.layers.warpctc(log_probs, labels, blank, norm_by_times,
-                                    input_lengths, label_lengths,
-                                    norm_by_batchsize, norm_by_total_logits_len)
+                                    input_lengths, label_lengths)
 
-    loss_out = fluid.layers.squeeze(loss_out, [-1])  # (B)
+    loss_out = fluid.layers.squeeze(loss_out, [-1])
     assert reduction in ['mean', 'sum', 'none']
     if reduction == 'mean':
         loss_out = paddle.mean(loss_out / label_lengths)
@@ -1544,7 +1538,7 @@ def cross_entropy(input,
             Indicate how to average the loss by batch_size,
             the candicates are ``'none'`` | ``'mean'`` | ``'sum'``.
             If :attr:`reduction` is ``'mean'``, the reduced mean loss is returned;
-            If :attr:`norm_by_batchsize` is ``'sum'``, the reduced sum loss is returned.
+            If :attr:`size_average` is ``'sum'``, the reduced sum loss is returned.
             If :attr:`reduction` is ``'none'``, the unreduced loss is returned.
             Default is ``'mean'``.
 
diff --git a/python/paddle/nn/layer/loss.py b/python/paddle/nn/layer/loss.py
index 781e13867f243..3ac0d675fb72c 100644
--- a/python/paddle/nn/layer/loss.py
+++ b/python/paddle/nn/layer/loss.py
@@ -1119,9 +1119,7 @@ def forward(self,
                 labels,
                 input_lengths,
                 label_lengths,
-                norm_by_times=False,
-                norm_by_batchsize=False,
-                norm_by_total_logits_len=False):
+                norm_by_times=False):
         return paddle.nn.functional.ctc_loss(
             log_probs,
             labels,
@@ -1129,9 +1127,7 @@ def forward(self,
             label_lengths,
             self.blank,
             self.reduction,
-            norm_by_times=norm_by_times,
-            norm_by_batchsize=norm_by_batchsize,
-            norm_by_total_logits_len=norm_by_total_logits_len)
+            norm_by_times=norm_by_times)
 
 
 class SmoothL1Loss(Layer):
diff --git a/python/paddle/optimizer/adamw.py b/python/paddle/optimizer/adamw.py
index 55aaac8dc4852..5fdcc0cd0d270 100644
--- a/python/paddle/optimizer/adamw.py
+++ b/python/paddle/optimizer/adamw.py
@@ -297,9 +297,8 @@ def _append_optimize_op(self, block, param_and_grad):
                 moment1, moment2, beta1_pow_acc, beta2_pow_acc, master_weight,
                 'epsilon', self._epsilon, 'lazy_mode', self._lazy_mode,
                 'min_row_size_to_use_multithread', 1000, 'beta1', _beta1,
-                'beta2', _beta2, 'coeff', self._coeff, 'multi_precision',
-                find_master, 'lr_ratio', lr_ratio_)
-
+                'beta2', _beta2, "with_decay", with_decay, 'coeff', self._coeff,
+                'multi_precision', find_master, 'lr_ratio', lr_ratio_)
             return None
 
         inputs = {
diff --git a/python/paddle/static/__init__.py b/python/paddle/static/__init__.py
index 20af4158df48f..92aa5000dfa58 100644
--- a/python/paddle/static/__init__.py
+++ b/python/paddle/static/__init__.py
@@ -1,4 +1,5 @@
 #   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#   Copyright (c) 2021 NVIDIA Corporation. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -13,6 +14,7 @@
 # limitations under the License.
 
 from . import amp  # noqa: F401
+from . import sparsity  # noqa: F401
 from . import nn  # noqa: F401
 from .io import save_inference_model  # noqa: F401
 from .io import load_inference_model  # noqa: F401
diff --git a/python/paddle/static/sparsity/__init__.py b/python/paddle/static/sparsity/__init__.py
new file mode 100644
index 0000000000000..59f794ef28aa4
--- /dev/null
+++ b/python/paddle/static/sparsity/__init__.py
@@ -0,0 +1,28 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+# Copyright (c) 2021 NVIDIA Corporation.  All rights reserved.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from ...fluid.contrib.sparsity import calculate_density  #noqa: F401
+from ...fluid.contrib.sparsity import decorate  #noqa: F401
+from ...fluid.contrib.sparsity import prune_model  #noqa: F401
+from ...fluid.contrib.sparsity import set_excluded_layers  #noqa: F401
+from ...fluid.contrib.sparsity import reset_excluded_layers  #noqa: F401
+
+__all__ = [     #noqa
+    'calculate_density',
+    'decorate',
+    'prune_model',
+    'set_excluded_layers',
+    'reset_excluded_layers'
+]
diff --git a/python/paddle/tensor/__init__.py b/python/paddle/tensor/__init__.py
index 04d0a3c745f10..cb7b2928d028f 100755
--- a/python/paddle/tensor/__init__.py
+++ b/python/paddle/tensor/__init__.py
@@ -52,6 +52,7 @@
 from .linalg import eigvals  # noqa: F401
 from .linalg import multi_dot  # noqa: F401
 from .linalg import svd  # noqa: F401
+from .linalg import eigvalsh  # noqa: F401
 from .linalg import eigh  # noqa: F401
 from .linalg import pinv  # noqa: F401
 from .linalg import solve  # noqa: F401
@@ -196,6 +197,7 @@
 from .random import randn  # noqa: F401
 from .random import rand  # noqa: F401
 from .random import randint  # noqa: F401
+from .random import randint_like  # noqa: F401
 from .random import randperm  # noqa: F401
 from .search import argmax  # noqa: F401
 from .search import argmin  # noqa: F401
@@ -240,6 +242,7 @@
            'matrix_power',
            'qr',
            'eigvals',
+           'eigvalsh',
            'abs',
            'acos',
            'all',
@@ -395,6 +398,7 @@
            'uniform_',
            'multi_dot',
            'solve',
+           'triangular_solve'
 ]
 
 #this list used in math_op_patch.py for magic_method bind
diff --git a/python/paddle/tensor/linalg.py b/python/paddle/tensor/linalg.py
index aea56432fa9ca..abfc72c797a85 100644
--- a/python/paddle/tensor/linalg.py
+++ b/python/paddle/tensor/linalg.py
@@ -14,8 +14,8 @@
 
 import numpy as np
 from ..fluid.layer_helper import LayerHelper
+from ..fluid.framework import in_dygraph_mode, _varbase_creator, Variable, _dygraph_tracer
 from ..fluid.data_feeder import check_variable_and_dtype, check_type, check_dtype
-from ..fluid.framework import in_dygraph_mode, _varbase_creator, Variable
 
 from ..fluid.layers import transpose, cast  # noqa: F401
 from ..fluid import layers
@@ -2313,3 +2313,143 @@ def solve(x, y, name=None):
         type="solve", inputs={"X": x,
                               "Y": y}, outputs={"Out": out})
     return out
+
+
+def triangular_solve(x,
+                     y,
+                     upper=True,
+                     transpose=False,
+                     unitriangular=False,
+                     name=None):
+    r"""
+    Computes the solution of a system of equations with a triangular coefficient matrix `x` and
+    multiple right-hand sides `y` .
+
+    Input `x` and `y` is 2D matrices or batches of 2D matrices. If the inputs are batches, the outputs
+    is also batches.
+
+    Args:
+        x (Tensor): The input triangular coefficient matrix. Its shape should be `[*, M, M]`, where `*` is zero or
+            more batch dimensions. Its data type should be float32 or float64.
+        y (Tensor): Multiple right-hand sides of system of equations. Its shape should be `[*, M, K]`, where `*` is 
+            zero or more batch dimensions. Its data type should be float32 or float64.
+        upper (bool, optional): Whether to solve the upper-triangular system of equations (default) or the lower-triangular 
+            system of equations. Default: True.
+        transpose (bool, optional): whether `x` should be transposed before calculation. Default: False.
+        unitriangular (bool, optional): whether `x` is unit triangular. If True, the diagonal elements of `x` are assumed 
+            to be 1 and not referenced from `x` . Default: False.
+        name(str, optional): Name for the operation (optional, default is None).
+            For more information, please refer to :ref:`api_guide_Name`.
+
+    Returns:
+        Tensor: The solution of the system of equations. Its data type should be the same as that of `x`.
+
+    Examples:
+    .. code-block:: python
+
+        # a square system of linear equations:
+        # x1 +   x2  +   x3 = 0
+        #      2*x2  +   x3 = -9
+        #               -x3 = 5
+
+        import paddle
+        import numpy as np
+
+        x = paddle.to_tensor([[1, 1, 1], 
+                              [0, 2, 1],
+                              [0, 0,-1]], dtype="float64")
+        y = paddle.to_tensor([[0], [-9], [5]], dtype="float64")
+        out = paddle.linalg.triangular_solve(x, y, upper=True)
+
+        print(out)
+        # [7, -2, -5]
+    """
+    if in_dygraph_mode():
+        return _C_ops.triangular_solve(x, y, 'upper', upper, 'transpose',
+                                       transpose, 'unitriangular',
+                                       unitriangular)
+
+    inputs = {"X": [x], "Y": [y]}
+    helper = LayerHelper("triangular_solve", **locals())
+    check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'triangular_solve')
+    check_variable_and_dtype(y, 'y', ['float32', 'float64'], 'triangular_solve')
+    out = helper.create_variable_for_type_inference(dtype=x.dtype)
+
+    helper.append_op(
+        type='triangular_solve',
+        inputs={'X': x,
+                'Y': y},
+        outputs={'Out': out},
+        attrs={
+            'upper': upper,
+            'transpose': transpose,
+            'unitriangular': unitriangular
+        })
+    return out
+
+
+def eigvalsh(x, UPLO='L', name=None):
+    """
+    Computes the eigenvalues of a 
+    complex Hermitian (conjugate symmetric) or a real symmetric matrix.
+
+    Args:
+        x (Tensor): A tensor with shape :math:`[_, M, M]` , The data type of the input Tensor x
+            should be one of float32, float64, complex64, complex128.
+        UPLO(str, optional): Lower triangular part of a (‘L’, default) or the upper triangular part (‘U’).
+        name(str, optional): The default value is None.  Normally there is no need for user to set this
+            property.  For more information, please refer to :ref:`api_guide_Name`.
+
+    Returns:
+        Tensor: The tensor eigenvalues in ascending order.
+
+    Examples:
+        .. code-block:: python
+
+            import numpy as np
+            import paddle
+
+            x_data = np.array([[1, -2j], [2j, 5]])
+            x = paddle.to_tensor(x_data)
+            out_value = paddle.eigvalsh(x, UPLO='L')
+            print(out_value)
+            #[0.17157288, 5.82842712]
+    """
+    if in_dygraph_mode():
+        is_test = x.stop_gradient
+        values, _ = _C_ops.eigvalsh(x, 'UPLO', UPLO, 'is_test', is_test)
+        return values
+
+    def __check_input(x, UPLO):
+        x_shape = list(x.shape)
+        if len(x.shape) < 2:
+            raise ValueError(
+                "Input(input) only support >=2 tensor, but received "
+                "length of Input(input) is %s." % len(x.shape))
+        if x_shape[-1] != x_shape[-2]:
+            raise ValueError(
+                "The input matrix must be batches of square matrices. But received x's dimention: {}".
+                format(x_shape))
+        if UPLO is not 'L' and UPLO is not 'U':
+            raise ValueError(
+                "UPLO must be L or U. But received UPLO is: {}".format(UPLO))
+
+    __check_input(x, UPLO)
+
+    helper = LayerHelper('eigvalsh', **locals())
+    check_variable_and_dtype(x, 'dtype',
+                             ['float32', 'float64', 'complex64', 'complex128'],
+                             'eigvalsh')
+
+    out_value = helper.create_variable_for_type_inference(dtype=x.dtype)
+    out_vector = helper.create_variable_for_type_inference(dtype=x.dtype)
+
+    is_test = x.stop_gradient
+    helper.append_op(
+        type='eigvalsh',
+        inputs={'X': x},
+        outputs={'Eigenvalues': out_value,
+                 'Eigenvectors': out_vector},
+        attrs={'UPLO': UPLO,
+               'is_test': is_test})
+    return out_value
diff --git a/python/paddle/tensor/random.py b/python/paddle/tensor/random.py
index 6aec6c9961c13..67e5120a54b79 100644
--- a/python/paddle/tensor/random.py
+++ b/python/paddle/tensor/random.py
@@ -661,6 +661,180 @@ def randint(low=0, high=None, shape=[1], dtype=None, name=None):
     return out
 
 
+def randint_like(x, low=0, high=None, dtype=None, name=None):
+    """
+    This OP returns a Tensor filled with random integers from a discrete uniform
+    distribution in the range [``low``, ``high``), with the same shape as ``x``.
+    (use ``dtype`` if ``dtype`` is not None) 
+    If ``high`` is None (the default), the range is [0, ``low``).
+
+    Args:
+        x (Tensor): The input tensor which specifies shape. The dtype of ``x`` 
+            can be bool, int32, int64, float16, float32, float64.
+        low (int): The lower bound on the range of random values to generate.
+            The ``low`` is included in the range. If ``high`` is None, the
+            range is [0, ``low``). Default is 0.
+        high (int, optional): The upper bound on the range of random values to
+            generate, the ``high`` is excluded in the range. Default is None
+            (see above for behavior if high = None). Default is None.
+        dtype (str|np.dtype, optional): The data type of the
+            output tensor. Supported data types: bool, int32, int64, float16, 
+            float32, float64. If ``dytpe`` is None, the data type is the
+            same as x's data type. Default is None.
+        name (str, optional): The default value is None.  Normally there is no
+            need for user to set this property.  For more information, please
+            refer to :ref:`api_guide_Name`.
+
+    Returns: 
+        Tensor: A Tensor filled with random integers from a discrete uniform
+        distribution in the range [``low``, ``high``), with ``shape`` and ``dtype``.
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+
+            # example 1:
+            # dtype is None and the dtype of x is float16
+            x = paddle.zeros((1,2)).astype("float16")
+            out1 = paddle.randint_like(x, low=-5, high=5)
+            print(out1)
+            print(out1.dtype)
+            # [[0, -3]]  # random
+            # paddle.float16
+
+            # example 2:
+            # dtype is None and the dtype of x is float32
+            x = paddle.zeros((1,2)).astype("float32")
+            out2 = paddle.randint_like(x, low=-5, high=5)
+            print(out2)
+            print(out2.dtype)
+            # [[0, -3]]  # random
+            # paddle.float32
+
+            # example 3:
+            # dtype is None and the dtype of x is float64
+            x = paddle.zeros((1,2)).astype("float64")
+            out3 = paddle.randint_like(x, low=-5, high=5)
+            print(out3)
+            print(out3.dtype)
+            # [[0, -3]]  # random
+            # paddle.float64
+
+            # example 4:
+            # dtype is None and the dtype of x is int32
+            x = paddle.zeros((1,2)).astype("int32")
+            out4 = paddle.randint_like(x, low=-5, high=5)
+            print(out4)
+            print(out4.dtype)
+            # [[0, -3]]  # random
+            # paddle.int32
+
+            # example 5:
+            # dtype is None and the dtype of x is int64
+            x = paddle.zeros((1,2)).astype("int64")
+            out5 = paddle.randint_like(x, low=-5, high=5)
+            print(out5)
+            print(out5.dtype)
+            # [[0, -3]]  # random
+            # paddle.int64
+
+            # example 6:
+            # dtype is float64 and the dtype of x is float32
+            x = paddle.zeros((1,2)).astype("float32")
+            out6 = paddle.randint_like(x, low=-5, high=5, dtype="float64")
+            print(out6)
+            print(out6.dtype)
+            # [[0, -1]]  # random
+            # paddle.float64
+
+            # example 7:
+            # dtype is bool and the dtype of x is float32
+            x = paddle.zeros((1,2)).astype("float32")
+            out7 = paddle.randint_like(x, low=-5, high=5, dtype="bool")
+            print(out7)
+            print(out7.dtype)
+            # [[0, -1]]  # random
+            # paddle.bool
+
+            # example 8:
+            # dtype is int32 and the dtype of x is float32
+            x = paddle.zeros((1,2)).astype("float32")
+            out8 = paddle.randint_like(x, low=-5, high=5, dtype="int32")
+            print(out8)
+            print(out8.dtype)
+            # [[0, -1]]  # random
+            # paddle.int32
+
+            # example 9:
+            # dtype is int64 and the dtype of x is float32
+            x = paddle.zeros((1,2)).astype("float32")
+            out9 = paddle.randint_like(x, low=-5, high=5, dtype="int64")
+            print(out9)
+            print(out9.dtype)
+            # [[0, -1]]  # random
+            # paddle.int64
+
+            # example 10:
+            # dtype is int64 and the dtype of x is bool
+            x = paddle.zeros((1,2)).astype("bool")
+            out10 = paddle.randint_like(x, low=-5, high=5, dtype="int64")
+            print(out10)
+            print(out10.dtype)
+            # [[0, -1]]  # random
+            # paddle.int64
+
+    """
+    if high is None:
+        if low <= 0:
+            raise ValueError(
+                "If high is None, low must be greater than 0, but received low = {0}.".
+                format(low))
+        high = low
+        low = 0
+    if dtype is None:
+        dtype = x.dtype
+    if not isinstance(dtype, core.VarDesc.VarType):
+        dtype = convert_np_dtype_to_dtype_(dtype)
+    shape = x.shape
+
+    if low >= high:
+        raise ValueError(
+            "randint_like's low must less then high, but received low = {0}, "
+            "high = {1}".format(low, high))
+
+    if in_dygraph_mode():
+        shape = utils.convert_shape_to_list(shape)
+        out = _C_ops.randint('shape', shape, 'low', low, 'high', high, 'seed',
+                             0, 'dtype', core.VarDesc.VarType.INT64)
+        out = paddle.cast(out, dtype)
+        return out
+
+    check_shape(shape, 'randint_like')
+    check_dtype(dtype, 'dtype',
+                ['bool', 'float16', 'float32', 'float64', 'int32',
+                 'int64'], 'randint_like')
+
+    inputs = dict()
+    attrs = {
+        'low': low,
+        'high': high,
+        'seed': 0,
+        'dtype': core.VarDesc.VarType.INT64
+    }
+    utils.get_shape_tensor_inputs(
+        inputs=inputs, attrs=attrs, shape=shape, op_type='randint_like')
+
+    helper = LayerHelper("randint", **locals())
+    out = helper.create_variable_for_type_inference(
+        dtype=core.VarDesc.VarType.INT64)
+    helper.append_op(
+        type='randint', inputs=inputs, outputs={'Out': out}, attrs=attrs)
+    out.stop_gradient = True
+    out = paddle.cast(out, dtype)
+    return out
+
+
 def randperm(n, dtype="int64", name=None):
     """
     This OP returns a 1-D Tensor filled with random permutation values from 0
diff --git a/python/paddle/tests/CMakeLists.txt b/python/paddle/tests/CMakeLists.txt
index bc808e72bc75c..a3191608e29d4 100644
--- a/python/paddle/tests/CMakeLists.txt
+++ b/python/paddle/tests/CMakeLists.txt
@@ -50,3 +50,7 @@ set_tests_properties(test_dataset_uci_housing PROPERTIES TIMEOUT 120)
 set_tests_properties(test_dataset_imdb PROPERTIES TIMEOUT 300)
 set_tests_properties(test_pretrained_model PROPERTIES TIMEOUT 600) 
 set_tests_properties(test_hapi_hub PROPERTIES TIMEOUT 300) 
+
+if(NOT WITH_COVERAGE)
+    LIST(REMOVE_ITEM TEST_OPS test_hapi_hub)
+endif()
diff --git a/python/paddle/tests/test_pretrained_model.py b/python/paddle/tests/test_pretrained_model.py
index f2b779e3177fe..913caec733fe9 100644
--- a/python/paddle/tests/test_pretrained_model.py
+++ b/python/paddle/tests/test_pretrained_model.py
@@ -54,7 +54,9 @@ def infer(self, arch):
     def test_models(self):
         arches = [
             'mobilenet_v1', 'mobilenet_v2', 'resnet18', 'vgg16', 'alexnet',
-            'resnext50_32x4d', 'inception_v3'
+            'resnext50_32x4d', 'inception_v3', 'densenet121', 'squeezenet1_0',
+            'squeezenet1_1', 'googlenet', 'shufflenet_v2_x0_25',
+            'shufflenet_v2_swish'
         ]
         for arch in arches:
             self.infer(arch)
diff --git a/python/paddle/tests/test_vision_models.py b/python/paddle/tests/test_vision_models.py
index 9eb75826b7380..48ea1b80c9bf8 100644
--- a/python/paddle/tests/test_vision_models.py
+++ b/python/paddle/tests/test_vision_models.py
@@ -70,9 +70,33 @@ def test_resnet101(self):
     def test_resnet152(self):
         self.models_infer('resnet152')
 
+    def test_densenet121(self):
+        self.models_infer('densenet121')
+
+    def test_densenet161(self):
+        self.models_infer('densenet161')
+
+    def test_densenet169(self):
+        self.models_infer('densenet169')
+
+    def test_densenet201(self):
+        self.models_infer('densenet201')
+
+    def test_densenet264(self):
+        self.models_infer('densenet264')
+
+    def test_squeezenet1_0(self):
+        self.models_infer('squeezenet1_0')
+
+    def test_squeezenet1_1(self):
+        self.models_infer('squeezenet1_1')
+
     def test_alexnet(self):
         self.models_infer('alexnet')
 
+    def test_shufflenetv2_swish(self):
+        self.models_infer('shufflenet_v2_swish')
+
     def test_resnext50_32x4d(self):
         self.models_infer('resnext50_32x4d')
 
@@ -94,6 +118,27 @@ def test_resnext152_64x4d(self):
     def test_inception_v3(self):
         self.models_infer('inception_v3')
 
+    def test_googlenet(self):
+        self.models_infer('googlenet')
+
+    def test_shufflenetv2_x0_25(self):
+        self.models_infer('shufflenet_v2_x0_25')
+
+    def test_shufflenetv2_x0_33(self):
+        self.models_infer('shufflenet_v2_x0_33')
+
+    def test_shufflenetv2_x0_5(self):
+        self.models_infer('shufflenet_v2_x0_5')
+
+    def test_shufflenetv2_x1_0(self):
+        self.models_infer('shufflenet_v2_x1_0')
+
+    def test_shufflenetv2_x1_5(self):
+        self.models_infer('shufflenet_v2_x1_5')
+
+    def test_shufflenetv2_x2_0(self):
+        self.models_infer('shufflenet_v2_x2_0')
+
     def test_vgg16_num_classes(self):
         vgg16 = models.__dict__['vgg16'](pretrained=False, num_classes=10)
 
diff --git a/python/paddle/utils/cpp_extension/cpp_extension.py b/python/paddle/utils/cpp_extension/cpp_extension.py
index 19fa84046ed2d..5370de9ed42aa 100644
--- a/python/paddle/utils/cpp_extension/cpp_extension.py
+++ b/python/paddle/utils/cpp_extension/cpp_extension.py
@@ -403,7 +403,7 @@ def unix_custom_single_compiler(obj, src, ext, cc_args, extra_postargs,
             cflags = copy.deepcopy(extra_postargs)
             try:
                 original_compiler = self.compiler.compiler_so
-                # nvcc compile CUDA source
+                # nvcc or hipcc compile CUDA source
                 if is_cuda_file(src):
                     if core.is_compiled_with_rocm():
                         assert ROCM_HOME is not None, "Not found ROCM runtime, \
@@ -429,6 +429,13 @@ def unix_custom_single_compiler(obj, src, ext, cc_args, extra_postargs,
                 elif isinstance(cflags, dict):
                     cflags = cflags['cxx']
 
+                # Note(qili93): HIP require some additional flags for CMAKE_C_FLAGS
+                if core.is_compiled_with_rocm():
+                    cflags.append('-D__HIP_PLATFORM_HCC__')
+                    cflags.append('-D__HIP_NO_HALF_CONVERSIONS__=1')
+                    cflags.append(
+                        '-DTHRUST_DEVICE_SYSTEM=THRUST_DEVICE_SYSTEM_HIP')
+
                 # NOTE(Aurelius84): Since Paddle 2.0, we require gcc version > 5.x,
                 # so we add this flag to ensure the symbol names from user compiled
                 # shared library have same ABI suffix with core_(no)avx.so.
@@ -436,7 +443,10 @@ def unix_custom_single_compiler(obj, src, ext, cc_args, extra_postargs,
                 add_compile_flag(['-D_GLIBCXX_USE_CXX11_ABI=1'], cflags)
                 # Append this macor only when jointly compiling .cc with .cu
                 if not is_cuda_file(src) and self.contain_cuda_file:
-                    cflags.append('-DPADDLE_WITH_CUDA')
+                    if core.is_compiled_with_rocm():
+                        cflags.append('-DPADDLE_WITH_HIP')
+                    else:
+                        cflags.append('-DPADDLE_WITH_CUDA')
 
                 add_std_without_repeat(
                     cflags, self.compiler.compiler_type, use_std14=True)
diff --git a/python/paddle/utils/cpp_extension/extension_utils.py b/python/paddle/utils/cpp_extension/extension_utils.py
index 0a2d71abfdee4..5fee663034289 100644
--- a/python/paddle/utils/cpp_extension/extension_utils.py
+++ b/python/paddle/utils/cpp_extension/extension_utils.py
@@ -56,7 +56,12 @@
 
 MSVC_LINK_FLAGS = ['/MACHINE:X64']
 
-COMMON_NVCC_FLAGS = ['-DPADDLE_WITH_CUDA', '-DEIGEN_USE_GPU']
+if core.is_compiled_with_rocm():
+    COMMON_HIPCC_FLAGS = [
+        '-DPADDLE_WITH_HIP', '-DEIGEN_USE_GPU', '-DEIGEN_USE_HIP'
+    ]
+else:
+    COMMON_NVCC_FLAGS = ['-DPADDLE_WITH_CUDA', '-DEIGEN_USE_GPU']
 
 GCC_MINI_VERSION = (5, 4, 0)
 MSVC_MINI_VERSION = (19, 0, 24215)
@@ -319,10 +324,14 @@ def prepare_unix_cudaflags(cflags):
     """
     Prepare all necessary compiled flags for nvcc compiling CUDA files.
     """
-    cflags = COMMON_NVCC_FLAGS + [
-        '-ccbin', 'cc', '-Xcompiler', '-fPIC', '--expt-relaxed-constexpr',
-        '-DNVCC'
-    ] + cflags + get_cuda_arch_flags(cflags)
+    if core.is_compiled_with_rocm():
+        cflags = COMMON_HIPCC_FLAGS + ['-Xcompiler', '-fPIC'
+                                       ] + cflags + get_rocm_arch_flags(cflags)
+    else:
+        cflags = COMMON_NVCC_FLAGS + [
+            '-ccbin', 'cc', '-Xcompiler', '-fPIC', '--expt-relaxed-constexpr',
+            '-DNVCC'
+        ] + cflags + get_cuda_arch_flags(cflags)
 
     return cflags
 
@@ -358,6 +367,14 @@ def get_cuda_arch_flags(cflags):
     return []
 
 
+def get_rocm_arch_flags(cflags):
+    """
+    For ROCm platform, amdgpu target should be added for HIPCC.
+    """
+    cflags = cflags + ['-fno-gpu-rdc', '-amdgpu-target=gfx906']
+    return cflags
+
+
 def _get_fluid_path():
     """
     Return installed fluid dir path.
@@ -471,7 +488,10 @@ def normalize_extension_kwargs(kwargs, use_cuda=False):
         add_compile_flag(extra_compile_args, ['-w'])  # disable warning
 
         if use_cuda:
-            extra_link_args.append('-lcudart')
+            if core.is_compiled_with_rocm():
+                extra_link_args.append('-lamdhip64')
+            else:
+                extra_link_args.append('-lcudart')
 
         kwargs['extra_link_args'] = extra_link_args
 
diff --git a/python/paddle/vision/__init__.py b/python/paddle/vision/__init__.py
index e5db5f6c4f882..54f293d7f57d1 100644
--- a/python/paddle/vision/__init__.py
+++ b/python/paddle/vision/__init__.py
@@ -38,12 +38,21 @@
 from .models import mobilenet_v1  # noqa: F401
 from .models import MobileNetV2  # noqa: F401
 from .models import mobilenet_v2  # noqa: F401
+from .models import SqueezeNet  # noqa: F401
+from .models import squeezenet1_0  # noqa: F401
+from .models import squeezenet1_1  # noqa: F401
 from .models import VGG  # noqa: F401
 from .models import vgg11  # noqa: F401
 from .models import vgg13  # noqa: F401
 from .models import vgg16  # noqa: F401
 from .models import vgg19  # noqa: F401
 from .models import LeNet  # noqa: F401
+from .models import DenseNet  # noqa: F401
+from .models import densenet121  # noqa: F401
+from .models import densenet161  # noqa: F401
+from .models import densenet169  # noqa: F401
+from .models import densenet201  # noqa: F401
+from .models import densenet264  # noqa: F401
 from .models import AlexNet  # noqa: F401
 from .models import alexnet  # noqa: F401
 from .models import ResNeXt  # noqa: F401
@@ -55,6 +64,16 @@
 from .models import resnext152_64x4d  # noqa: F401
 from .models import InceptionV3  # noqa: F401
 from .models import inception_v3  # noqa: F401
+from .models import GoogLeNet  # noqa: F401
+from .models import googlenet  # noqa: F401
+from .models import ShuffleNetV2  # noqa: F401
+from .models import shufflenet_v2_x0_25  # noqa: F401
+from .models import shufflenet_v2_x0_33  # noqa: F401
+from .models import shufflenet_v2_x0_5  # noqa: F401
+from .models import shufflenet_v2_x1_0  # noqa: F401
+from .models import shufflenet_v2_x1_5  # noqa: F401
+from .models import shufflenet_v2_x2_0  # noqa: F401
+from .models import shufflenet_v2_swish  # noqa: F401
 from .transforms import BaseTransform  # noqa: F401
 from .transforms import Compose  # noqa: F401
 from .transforms import Resize  # noqa: F401
diff --git a/python/paddle/vision/models/__init__.py b/python/paddle/vision/models/__init__.py
index 7d8cb58fad969..e9a6af32d406b 100644
--- a/python/paddle/vision/models/__init__.py
+++ b/python/paddle/vision/models/__init__.py
@@ -28,6 +28,12 @@
 from .vgg import vgg16  # noqa: F401
 from .vgg import vgg19  # noqa: F401
 from .lenet import LeNet  # noqa: F401
+from .densenet import DenseNet  # noqa: F401
+from .densenet import densenet121  # noqa: F401
+from .densenet import densenet161  # noqa: F401
+from .densenet import densenet169  # noqa: F401
+from .densenet import densenet201  # noqa: F401
+from .densenet import densenet264  # noqa: F401
 from .alexnet import AlexNet  # noqa: F401
 from .alexnet import alexnet  # noqa: F401
 from .resnext import ResNeXt  # noqa: F401
@@ -39,6 +45,19 @@
 from .resnext import resnext152_64x4d  # noqa: F401
 from .inceptionv3 import InceptionV3  # noqa: F401
 from .inceptionv3 import inception_v3  # noqa: F401
+from .squeezenet import SqueezeNet  # noqa: F401
+from .squeezenet import squeezenet1_0  # noqa: F401
+from .squeezenet import squeezenet1_1  # noqa: F401
+from .googlenet import GoogLeNet  # noqa: F401
+from .googlenet import googlenet  # noqa: F401
+from .shufflenetv2 import ShuffleNetV2  # noqa: F401
+from .shufflenetv2 import shufflenet_v2_x0_25  # noqa: F401
+from .shufflenetv2 import shufflenet_v2_x0_33  # noqa: F401
+from .shufflenetv2 import shufflenet_v2_x0_5  # noqa: F401
+from .shufflenetv2 import shufflenet_v2_x1_0  # noqa: F401
+from .shufflenetv2 import shufflenet_v2_x1_5  # noqa: F401
+from .shufflenetv2 import shufflenet_v2_x2_0  # noqa: F401
+from .shufflenetv2 import shufflenet_v2_swish  # noqa: F401
 
 __all__ = [ #noqa
     'ResNet',
@@ -57,6 +76,12 @@
     'MobileNetV2',
     'mobilenet_v2',
     'LeNet',
+    'DenseNet',
+    'densenet121',
+    'densenet161',
+    'densenet169',
+    'densenet201',
+    'densenet264',
     'AlexNet',
     'alexnet',
     'ResNeXt',
@@ -67,5 +92,18 @@
     'resnext152_32x4d',
     'resnext152_64x4d',
     'InceptionV3',
-    'inception_v3'
+    'inception_v3',
+    'SqueezeNet',
+    'squeezenet1_0',
+    'squeezenet1_1',
+    'GoogLeNet',
+    'googlenet',
+    'ShuffleNetV2',
+    'shufflenet_v2_x0_25',
+    'shufflenet_v2_x0_33',
+    'shufflenet_v2_x0_5',
+    'shufflenet_v2_x1_0',
+    'shufflenet_v2_x1_5',
+    'shufflenet_v2_x2_0',
+    'shufflenet_v2_swish'
 ]
diff --git a/python/paddle/vision/models/densenet.py b/python/paddle/vision/models/densenet.py
new file mode 100644
index 0000000000000..46c7b6dc52b58
--- /dev/null
+++ b/python/paddle/vision/models/densenet.py
@@ -0,0 +1,417 @@
+# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import math
+
+import paddle
+import paddle.nn as nn
+from paddle.nn import Conv2D, BatchNorm, Linear, Dropout
+from paddle.nn import AdaptiveAvgPool2D, MaxPool2D, AvgPool2D
+from paddle.nn.initializer import Uniform
+from paddle.fluid.param_attr import ParamAttr
+from paddle.utils.download import get_weights_path_from_url
+
+__all__ = []
+
+model_urls = {
+    'densenet121':
+    ('https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/DenseNet121_pretrained.pdparams',
+     'db1b239ed80a905290fd8b01d3af08e4'),
+    'densenet161':
+    ('https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/DenseNet161_pretrained.pdparams',
+     '62158869cb315098bd25ddbfd308a853'),
+    'densenet169':
+    ('https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/DenseNet169_pretrained.pdparams',
+     '82cc7c635c3f19098c748850efb2d796'),
+    'densenet201':
+    ('https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/DenseNet201_pretrained.pdparams',
+     '16ca29565a7712329cf9e36e02caaf58'),
+    'densenet264':
+    ('https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/DenseNet264_pretrained.pdparams',
+     '3270ce516b85370bba88cfdd9f60bff4'),
+}
+
+
+class BNACConvLayer(nn.Layer):
+    def __init__(self,
+                 num_channels,
+                 num_filters,
+                 filter_size,
+                 stride=1,
+                 pad=0,
+                 groups=1,
+                 act="relu"):
+        super(BNACConvLayer, self).__init__()
+        self._batch_norm = BatchNorm(num_channels, act=act)
+
+        self._conv = Conv2D(
+            in_channels=num_channels,
+            out_channels=num_filters,
+            kernel_size=filter_size,
+            stride=stride,
+            padding=pad,
+            groups=groups,
+            weight_attr=ParamAttr(),
+            bias_attr=False)
+
+    def forward(self, input):
+        y = self._batch_norm(input)
+        y = self._conv(y)
+        return y
+
+
+class DenseLayer(nn.Layer):
+    def __init__(self, num_channels, growth_rate, bn_size, dropout):
+        super(DenseLayer, self).__init__()
+        self.dropout = dropout
+
+        self.bn_ac_func1 = BNACConvLayer(
+            num_channels=num_channels,
+            num_filters=bn_size * growth_rate,
+            filter_size=1,
+            pad=0,
+            stride=1)
+
+        self.bn_ac_func2 = BNACConvLayer(
+            num_channels=bn_size * growth_rate,
+            num_filters=growth_rate,
+            filter_size=3,
+            pad=1,
+            stride=1)
+
+        if dropout:
+            self.dropout_func = Dropout(p=dropout, mode="downscale_in_infer")
+
+    def forward(self, input):
+        conv = self.bn_ac_func1(input)
+        conv = self.bn_ac_func2(conv)
+        if self.dropout:
+            conv = self.dropout_func(conv)
+        conv = paddle.concat([input, conv], axis=1)
+        return conv
+
+
+class DenseBlock(nn.Layer):
+    def __init__(self,
+                 num_channels,
+                 num_layers,
+                 bn_size,
+                 growth_rate,
+                 dropout,
+                 name=None):
+        super(DenseBlock, self).__init__()
+        self.dropout = dropout
+        self.dense_layer_func = []
+
+        pre_channel = num_channels
+        for layer in range(num_layers):
+            self.dense_layer_func.append(
+                self.add_sublayer(
+                    "{}_{}".format(name, layer + 1),
+                    DenseLayer(
+                        num_channels=pre_channel,
+                        growth_rate=growth_rate,
+                        bn_size=bn_size,
+                        dropout=dropout)))
+            pre_channel = pre_channel + growth_rate
+
+    def forward(self, input):
+        conv = input
+        for func in self.dense_layer_func:
+            conv = func(conv)
+        return conv
+
+
+class TransitionLayer(nn.Layer):
+    def __init__(self, num_channels, num_output_features):
+        super(TransitionLayer, self).__init__()
+
+        self.conv_ac_func = BNACConvLayer(
+            num_channels=num_channels,
+            num_filters=num_output_features,
+            filter_size=1,
+            pad=0,
+            stride=1)
+
+        self.pool2d_avg = AvgPool2D(kernel_size=2, stride=2, padding=0)
+
+    def forward(self, input):
+        y = self.conv_ac_func(input)
+        y = self.pool2d_avg(y)
+        return y
+
+
+class ConvBNLayer(nn.Layer):
+    def __init__(self,
+                 num_channels,
+                 num_filters,
+                 filter_size,
+                 stride=1,
+                 pad=0,
+                 groups=1,
+                 act="relu"):
+        super(ConvBNLayer, self).__init__()
+
+        self._conv = Conv2D(
+            in_channels=num_channels,
+            out_channels=num_filters,
+            kernel_size=filter_size,
+            stride=stride,
+            padding=pad,
+            groups=groups,
+            weight_attr=ParamAttr(),
+            bias_attr=False)
+        self._batch_norm = BatchNorm(num_filters, act=act)
+
+    def forward(self, input):
+        y = self._conv(input)
+        y = self._batch_norm(y)
+        return y
+
+
+class DenseNet(nn.Layer):
+    """DenseNet model from
+    `"Densely Connected Convolutional Networks" <https://arxiv.org/pdf/1608.06993.pdf>`_
+
+    Args:
+        layers (int): layers of densenet. Default: 121.
+        bn_size (int): expansion of growth rate in the middle layer. Default: 4.
+        dropout (float): dropout rate. Default: 0..
+        num_classes (int): output dim of last fc layer. Default: 1000.
+        with_pool (bool): use pool before the last fc layer or not. Default: True.
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            from paddle.vision.models import DenseNet
+
+            # build model
+            densenet = DenseNet()
+
+            x = paddle.rand([1, 3, 224, 224])
+            out = densenet(x)
+
+            print(out.shape)
+    """
+
+    def __init__(self,
+                 layers=121,
+                 bn_size=4,
+                 dropout=0.,
+                 num_classes=1000,
+                 with_pool=True):
+        super(DenseNet, self).__init__()
+        self.num_classes = num_classes
+        self.with_pool = with_pool
+        supported_layers = [121, 161, 169, 201, 264]
+        assert layers in supported_layers, \
+            "supported layers are {} but input layer is {}".format(
+                supported_layers, layers)
+        densenet_spec = {
+            121: (64, 32, [6, 12, 24, 16]),
+            161: (96, 48, [6, 12, 36, 24]),
+            169: (64, 32, [6, 12, 32, 32]),
+            201: (64, 32, [6, 12, 48, 32]),
+            264: (64, 32, [6, 12, 64, 48])
+        }
+        num_init_features, growth_rate, block_config = densenet_spec[layers]
+
+        self.conv1_func = ConvBNLayer(
+            num_channels=3,
+            num_filters=num_init_features,
+            filter_size=7,
+            stride=2,
+            pad=3,
+            act='relu')
+        self.pool2d_max = MaxPool2D(kernel_size=3, stride=2, padding=1)
+        self.block_config = block_config
+        self.dense_block_func_list = []
+        self.transition_func_list = []
+        pre_num_channels = num_init_features
+        num_features = num_init_features
+        for i, num_layers in enumerate(block_config):
+            self.dense_block_func_list.append(
+                self.add_sublayer(
+                    "db_conv_{}".format(i + 2),
+                    DenseBlock(
+                        num_channels=pre_num_channels,
+                        num_layers=num_layers,
+                        bn_size=bn_size,
+                        growth_rate=growth_rate,
+                        dropout=dropout,
+                        name='conv' + str(i + 2))))
+
+            num_features = num_features + num_layers * growth_rate
+            pre_num_channels = num_features
+
+            if i != len(block_config) - 1:
+                self.transition_func_list.append(
+                    self.add_sublayer(
+                        "tr_conv{}_blk".format(i + 2),
+                        TransitionLayer(
+                            num_channels=pre_num_channels,
+                            num_output_features=num_features // 2)))
+                pre_num_channels = num_features // 2
+                num_features = num_features // 2
+
+        self.batch_norm = BatchNorm(num_features, act="relu")
+        if self.with_pool:
+            self.pool2d_avg = AdaptiveAvgPool2D(1)
+
+        if self.num_classes > 0:
+            stdv = 1.0 / math.sqrt(num_features * 1.0)
+            self.out = Linear(
+                num_features,
+                num_classes,
+                weight_attr=ParamAttr(initializer=Uniform(-stdv, stdv)),
+                bias_attr=ParamAttr())
+
+    def forward(self, input):
+        conv = self.conv1_func(input)
+        conv = self.pool2d_max(conv)
+
+        for i, num_layers in enumerate(self.block_config):
+            conv = self.dense_block_func_list[i](conv)
+            if i != len(self.block_config) - 1:
+                conv = self.transition_func_list[i](conv)
+
+        conv = self.batch_norm(conv)
+
+        if self.with_pool:
+            y = self.pool2d_avg(conv)
+
+        if self.num_classes > 0:
+            y = paddle.flatten(y, start_axis=1, stop_axis=-1)
+            y = self.out(y)
+
+        return y
+
+
+def _densenet(arch, layers, pretrained, **kwargs):
+    model = DenseNet(layers=layers, **kwargs)
+    if pretrained:
+        assert arch in model_urls, "{} model do not have a pretrained model now, you should set pretrained=False".format(
+            arch)
+        weight_path = get_weights_path_from_url(model_urls[arch][0],
+                                                model_urls[arch][1])
+
+        param = paddle.load(weight_path)
+        model.set_dict(param)
+
+    return model
+
+
+def densenet121(pretrained=False, **kwargs):
+    """DenseNet 121-layer model
+
+    Args:
+        pretrained (bool): If True, returns a model pre-trained on ImageNet
+
+    Examples:
+        .. code-block:: python
+
+            from paddle.vision.models import densenet121
+
+            # build model
+            model = densenet121()
+
+            # build model and load imagenet pretrained weight
+            # model = densenet121(pretrained=True)
+    """
+    return _densenet('densenet121', 121, pretrained, **kwargs)
+
+
+def densenet161(pretrained=False, **kwargs):
+    """DenseNet 161-layer model
+
+    Args:
+        pretrained (bool): If True, returns a model pre-trained on ImageNet
+
+    Examples:
+        .. code-block:: python
+
+            from paddle.vision.models import densenet161
+
+            # build model
+            model = densenet161()
+
+            # build model and load imagenet pretrained weight
+            # model = densenet161(pretrained=True)
+    """
+    return _densenet('densenet161', 161, pretrained, **kwargs)
+
+
+def densenet169(pretrained=False, **kwargs):
+    """DenseNet 169-layer model
+
+    Args:
+        pretrained (bool): If True, returns a model pre-trained on ImageNet
+
+    Examples:
+        .. code-block:: python
+
+            from paddle.vision.models import densenet169
+
+            # build model
+            model = densenet169()
+
+            # build model and load imagenet pretrained weight
+            # model = densenet169(pretrained=True)
+    """
+    return _densenet('densenet169', 169, pretrained, **kwargs)
+
+
+def densenet201(pretrained=False, **kwargs):
+    """DenseNet 201-layer model
+
+    Args:
+        pretrained (bool): If True, returns a model pre-trained on ImageNet
+
+    Examples:
+        .. code-block:: python
+
+            from paddle.vision.models import densenet201
+
+            # build model
+            model = densenet201()
+
+            # build model and load imagenet pretrained weight
+            # model = densenet201(pretrained=True)
+    """
+    return _densenet('densenet201', 201, pretrained, **kwargs)
+
+
+def densenet264(pretrained=False, **kwargs):
+    """DenseNet 264-layer model
+
+    Args:
+        pretrained (bool): If True, returns a model pre-trained on ImageNet
+
+    Examples:
+        .. code-block:: python
+
+            from paddle.vision.models import densenet264
+
+            # build model
+            model = densenet264()
+
+            # build model and load imagenet pretrained weight
+            # model = densenet264(pretrained=True)
+    """
+    return _densenet('densenet264', 264, pretrained, **kwargs)
diff --git a/python/paddle/vision/models/googlenet.py b/python/paddle/vision/models/googlenet.py
new file mode 100644
index 0000000000000..6afbc42603867
--- /dev/null
+++ b/python/paddle/vision/models/googlenet.py
@@ -0,0 +1,254 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import division
+from __future__ import print_function
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+
+from paddle.nn import Conv2D, Linear, Dropout
+from paddle.nn import MaxPool2D, AvgPool2D, AdaptiveAvgPool2D
+from paddle.nn.initializer import Uniform
+from paddle.fluid.param_attr import ParamAttr
+from paddle.utils.download import get_weights_path_from_url
+
+__all__ = []
+
+model_urls = {
+    "googlenet":
+    ("https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/GoogLeNet_pretrained.pdparams",
+     "80c06f038e905c53ab32c40eca6e26ae")
+}
+
+
+def xavier(channels, filter_size):
+    stdv = (3.0 / (filter_size**2 * channels))**0.5
+    param_attr = ParamAttr(initializer=Uniform(-stdv, stdv))
+    return param_attr
+
+
+class ConvLayer(nn.Layer):
+    def __init__(self,
+                 num_channels,
+                 num_filters,
+                 filter_size,
+                 stride=1,
+                 groups=1):
+        super(ConvLayer, self).__init__()
+
+        self._conv = Conv2D(
+            in_channels=num_channels,
+            out_channels=num_filters,
+            kernel_size=filter_size,
+            stride=stride,
+            padding=(filter_size - 1) // 2,
+            groups=groups,
+            bias_attr=False)
+
+    def forward(self, inputs):
+        y = self._conv(inputs)
+        return y
+
+
+class Inception(nn.Layer):
+    def __init__(self, input_channels, output_channels, filter1, filter3R,
+                 filter3, filter5R, filter5, proj):
+        super(Inception, self).__init__()
+
+        self._conv1 = ConvLayer(input_channels, filter1, 1)
+        self._conv3r = ConvLayer(input_channels, filter3R, 1)
+        self._conv3 = ConvLayer(filter3R, filter3, 3)
+        self._conv5r = ConvLayer(input_channels, filter5R, 1)
+        self._conv5 = ConvLayer(filter5R, filter5, 5)
+        self._pool = MaxPool2D(kernel_size=3, stride=1, padding=1)
+
+        self._convprj = ConvLayer(input_channels, proj, 1)
+
+    def forward(self, inputs):
+        conv1 = self._conv1(inputs)
+
+        conv3r = self._conv3r(inputs)
+        conv3 = self._conv3(conv3r)
+
+        conv5r = self._conv5r(inputs)
+        conv5 = self._conv5(conv5r)
+
+        pool = self._pool(inputs)
+        convprj = self._convprj(pool)
+
+        cat = paddle.concat([conv1, conv3, conv5, convprj], axis=1)
+        cat = F.relu(cat)
+        return cat
+
+
+class GoogLeNet(nn.Layer):
+    """GoogLeNet (Inception v1) model architecture from
+    `"Going Deeper with Convolutions" <https://arxiv.org/pdf/1409.4842.pdf>`_
+    
+    Args:
+        num_classes (int): output dim of last fc layer. If num_classes <=0, last fc layer 
+                            will not be defined. Default: 1000.
+        with_pool (bool, optional): use pool before the last fc layer or not. Default: True.
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            from paddle.vision.models import GoogLeNet
+
+            # build model
+            model = GoogLeNet()
+
+            x = paddle.rand([1, 3, 224, 224])
+            out, out1, out2 = model(x)
+
+            print(out.shape)
+    """
+
+    def __init__(self, num_classes=1000, with_pool=True):
+        super(GoogLeNet, self).__init__()
+        self.num_classes = num_classes
+        self.with_pool = with_pool
+
+        self._conv = ConvLayer(3, 64, 7, 2)
+        self._pool = MaxPool2D(kernel_size=3, stride=2)
+        self._conv_1 = ConvLayer(64, 64, 1)
+        self._conv_2 = ConvLayer(64, 192, 3)
+
+        self._ince3a = Inception(192, 192, 64, 96, 128, 16, 32, 32)
+        self._ince3b = Inception(256, 256, 128, 128, 192, 32, 96, 64)
+
+        self._ince4a = Inception(480, 480, 192, 96, 208, 16, 48, 64)
+        self._ince4b = Inception(512, 512, 160, 112, 224, 24, 64, 64)
+        self._ince4c = Inception(512, 512, 128, 128, 256, 24, 64, 64)
+        self._ince4d = Inception(512, 512, 112, 144, 288, 32, 64, 64)
+        self._ince4e = Inception(528, 528, 256, 160, 320, 32, 128, 128)
+
+        self._ince5a = Inception(832, 832, 256, 160, 320, 32, 128, 128)
+        self._ince5b = Inception(832, 832, 384, 192, 384, 48, 128, 128)
+
+        if with_pool:
+            # out
+            self._pool_5 = AdaptiveAvgPool2D(1)
+            # out1
+            self._pool_o1 = AvgPool2D(kernel_size=5, stride=3)
+            # out2
+            self._pool_o2 = AvgPool2D(kernel_size=5, stride=3)
+
+        if num_classes > 0:
+            # out
+            self._drop = Dropout(p=0.4, mode="downscale_in_infer")
+            self._fc_out = Linear(
+                1024, num_classes, weight_attr=xavier(1024, 1))
+
+            # out1
+            self._conv_o1 = ConvLayer(512, 128, 1)
+            self._fc_o1 = Linear(1152, 1024, weight_attr=xavier(2048, 1))
+            self._drop_o1 = Dropout(p=0.7, mode="downscale_in_infer")
+            self._out1 = Linear(1024, num_classes, weight_attr=xavier(1024, 1))
+
+            # out2
+            self._conv_o2 = ConvLayer(528, 128, 1)
+            self._fc_o2 = Linear(1152, 1024, weight_attr=xavier(2048, 1))
+            self._drop_o2 = Dropout(p=0.7, mode="downscale_in_infer")
+            self._out2 = Linear(1024, num_classes, weight_attr=xavier(1024, 1))
+
+    def forward(self, inputs):
+        x = self._conv(inputs)
+        x = self._pool(x)
+        x = self._conv_1(x)
+        x = self._conv_2(x)
+        x = self._pool(x)
+
+        x = self._ince3a(x)
+        x = self._ince3b(x)
+        x = self._pool(x)
+
+        ince4a = self._ince4a(x)
+        x = self._ince4b(ince4a)
+        x = self._ince4c(x)
+        ince4d = self._ince4d(x)
+        x = self._ince4e(ince4d)
+        x = self._pool(x)
+
+        x = self._ince5a(x)
+        ince5b = self._ince5b(x)
+
+        out, out1, out2 = ince5b, ince4a, ince4d
+
+        if self.with_pool:
+            out = self._pool_5(out)
+            out1 = self._pool_o1(out1)
+            out2 = self._pool_o2(out2)
+
+        if self.num_classes > 0:
+            out = self._drop(out)
+            out = paddle.squeeze(out, axis=[2, 3])
+            out = self._fc_out(out)
+
+            out1 = self._conv_o1(out1)
+            out1 = paddle.flatten(out1, start_axis=1, stop_axis=-1)
+            out1 = self._fc_o1(out1)
+            out1 = F.relu(out1)
+            out1 = self._drop_o1(out1)
+            out1 = self._out1(out1)
+
+            out2 = self._conv_o2(out2)
+            out2 = paddle.flatten(out2, start_axis=1, stop_axis=-1)
+            out2 = self._fc_o2(out2)
+            out2 = self._drop_o2(out2)
+            out2 = self._out2(out2)
+
+        return [out, out1, out2]
+
+
+def googlenet(pretrained=False, **kwargs):
+    """GoogLeNet (Inception v1) model architecture from
+    `"Going Deeper with Convolutions" <https://arxiv.org/pdf/1409.4842.pdf>`_
+    
+    Args:
+        pretrained (bool): If True, returns a model pre-trained on ImageNet
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            from paddle.vision.models import googlenet
+
+            # build model
+            model = googlenet()
+
+            # build model and load imagenet pretrained weight
+            # model = googlenet(pretrained=True)
+
+            x = paddle.rand([1, 3, 224, 224])
+            out, out1, out2 = model(x)
+
+            print(out.shape)
+    """
+    model = GoogLeNet(**kwargs)
+    arch = "googlenet"
+    if pretrained:
+        assert (
+            arch in model_urls
+        ), "{} model do not have a pretrained model now, you should set pretrained=False".format(
+            arch)
+        weight_path = get_weights_path_from_url(model_urls[arch][0],
+                                                model_urls[arch][1])
+
+        param = paddle.load(weight_path)
+        model.set_dict(param)
+    return model
diff --git a/python/paddle/vision/models/shufflenetv2.py b/python/paddle/vision/models/shufflenetv2.py
new file mode 100644
index 0000000000000..041f3fc749b6c
--- /dev/null
+++ b/python/paddle/vision/models/shufflenetv2.py
@@ -0,0 +1,523 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import paddle
+import paddle.nn as nn
+from paddle.fluid.param_attr import ParamAttr
+from paddle.nn import AdaptiveAvgPool2D, BatchNorm, Conv2D, Linear, MaxPool2D
+from paddle.utils.download import get_weights_path_from_url
+
+__all__ = []
+
+model_urls = {
+    "shufflenet_v2_x0_25": (
+        "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ShuffleNetV2_x0_25_pretrained.pdparams",
+        "e753404cbd95027759c5f56ecd6c9c4b", ),
+    "shufflenet_v2_x0_33": (
+        "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ShuffleNetV2_x0_33_pretrained.pdparams",
+        "776e3cf9a4923abdfce789c45b8fe1f2", ),
+    "shufflenet_v2_x0_5": (
+        "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ShuffleNetV2_x0_5_pretrained.pdparams",
+        "e3649cf531566917e2969487d2bc6b60", ),
+    "shufflenet_v2_x1_0": (
+        "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ShuffleNetV2_x1_0_pretrained.pdparams",
+        "7821c348ea34e58847c43a08a4ac0bdf", ),
+    "shufflenet_v2_x1_5": (
+        "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ShuffleNetV2_x1_5_pretrained.pdparams",
+        "93a07fa557ab2d8803550f39e5b6c391", ),
+    "shufflenet_v2_x2_0": (
+        "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ShuffleNetV2_x2_0_pretrained.pdparams",
+        "4ab1f622fd0d341e0f84b4e057797563", ),
+    "shufflenet_v2_swish": (
+        "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ShuffleNetV2_swish_pretrained.pdparams",
+        "daff38b3df1b3748fccbb13cfdf02519", ),
+}
+
+
+def channel_shuffle(x, groups):
+    batch_size, num_channels, height, width = x.shape[0:4]
+    channels_per_group = num_channels // groups
+
+    # reshape
+    x = paddle.reshape(
+        x, shape=[batch_size, groups, channels_per_group, height, width])
+
+    # transpose
+    x = paddle.transpose(x, perm=[0, 2, 1, 3, 4])
+
+    # flatten
+    x = paddle.reshape(x, shape=[batch_size, num_channels, height, width])
+    return x
+
+
+class ConvBNLayer(nn.Layer):
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 kernel_size,
+                 stride,
+                 padding,
+                 groups=1,
+                 act=None):
+        super(ConvBNLayer, self).__init__()
+        self._conv = Conv2D(
+            in_channels=in_channels,
+            out_channels=out_channels,
+            kernel_size=kernel_size,
+            stride=stride,
+            padding=padding,
+            groups=groups,
+            weight_attr=ParamAttr(initializer=nn.initializer.KaimingNormal()),
+            bias_attr=False, )
+
+        self._batch_norm = BatchNorm(out_channels, act=act)
+
+    def forward(self, inputs):
+        x = self._conv(inputs)
+        x = self._batch_norm(x)
+        return x
+
+
+class InvertedResidual(nn.Layer):
+    def __init__(self, in_channels, out_channels, stride, act="relu"):
+        super(InvertedResidual, self).__init__()
+        self._conv_pw = ConvBNLayer(
+            in_channels=in_channels // 2,
+            out_channels=out_channels // 2,
+            kernel_size=1,
+            stride=1,
+            padding=0,
+            groups=1,
+            act=act)
+        self._conv_dw = ConvBNLayer(
+            in_channels=out_channels // 2,
+            out_channels=out_channels // 2,
+            kernel_size=3,
+            stride=stride,
+            padding=1,
+            groups=out_channels // 2,
+            act=None)
+        self._conv_linear = ConvBNLayer(
+            in_channels=out_channels // 2,
+            out_channels=out_channels // 2,
+            kernel_size=1,
+            stride=1,
+            padding=0,
+            groups=1,
+            act=act)
+
+    def forward(self, inputs):
+        x1, x2 = paddle.split(
+            inputs,
+            num_or_sections=[inputs.shape[1] // 2, inputs.shape[1] // 2],
+            axis=1)
+        x2 = self._conv_pw(x2)
+        x2 = self._conv_dw(x2)
+        x2 = self._conv_linear(x2)
+        out = paddle.concat([x1, x2], axis=1)
+        return channel_shuffle(out, 2)
+
+
+class InvertedResidualDS(nn.Layer):
+    def __init__(self, in_channels, out_channels, stride, act="relu"):
+        super(InvertedResidualDS, self).__init__()
+
+        # branch1
+        self._conv_dw_1 = ConvBNLayer(
+            in_channels=in_channels,
+            out_channels=in_channels,
+            kernel_size=3,
+            stride=stride,
+            padding=1,
+            groups=in_channels,
+            act=None)
+        self._conv_linear_1 = ConvBNLayer(
+            in_channels=in_channels,
+            out_channels=out_channels // 2,
+            kernel_size=1,
+            stride=1,
+            padding=0,
+            groups=1,
+            act=act)
+        # branch2
+        self._conv_pw_2 = ConvBNLayer(
+            in_channels=in_channels,
+            out_channels=out_channels // 2,
+            kernel_size=1,
+            stride=1,
+            padding=0,
+            groups=1,
+            act=act)
+        self._conv_dw_2 = ConvBNLayer(
+            in_channels=out_channels // 2,
+            out_channels=out_channels // 2,
+            kernel_size=3,
+            stride=stride,
+            padding=1,
+            groups=out_channels // 2,
+            act=None)
+        self._conv_linear_2 = ConvBNLayer(
+            in_channels=out_channels // 2,
+            out_channels=out_channels // 2,
+            kernel_size=1,
+            stride=1,
+            padding=0,
+            groups=1,
+            act=act)
+
+    def forward(self, inputs):
+        x1 = self._conv_dw_1(inputs)
+        x1 = self._conv_linear_1(x1)
+        x2 = self._conv_pw_2(inputs)
+        x2 = self._conv_dw_2(x2)
+        x2 = self._conv_linear_2(x2)
+        out = paddle.concat([x1, x2], axis=1)
+
+        return channel_shuffle(out, 2)
+
+
+class ShuffleNetV2(nn.Layer):
+    """ShuffleNetV2 model from
+    `"ShuffleNet V2: Practical Guidelines for Efficient CNN Architecture Design" <https://arxiv.org/pdf/1807.11164.pdf>`_
+
+    Args:
+        scale (float, optional) - scale of output channels. Default: True.
+        act (str, optional) - activation function of neural network. Default: "relu".
+        num_classes (int, optional): output dim of last fc layer. If num_classes <=0, last fc layer
+                            will not be defined. Default: 1000.
+        with_pool (bool, optional): use pool before the last fc layer or not. Default: True.
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            from paddle.vision.models import ShuffleNetV2
+
+            shufflenet_v2_swish = ShuffleNetV2(scale=1.0, act="swish")
+            x = paddle.rand([1, 3, 224, 224])
+            out = shufflenet_v2_swish(x)
+            print(out.shape)
+
+    """
+
+    def __init__(self, scale=1.0, act="relu", num_classes=1000, with_pool=True):
+        super(ShuffleNetV2, self).__init__()
+        self.scale = scale
+        self.num_classes = num_classes
+        self.with_pool = with_pool
+        stage_repeats = [4, 8, 4]
+
+        if scale == 0.25:
+            stage_out_channels = [-1, 24, 24, 48, 96, 512]
+        elif scale == 0.33:
+            stage_out_channels = [-1, 24, 32, 64, 128, 512]
+        elif scale == 0.5:
+            stage_out_channels = [-1, 24, 48, 96, 192, 1024]
+        elif scale == 1.0:
+            stage_out_channels = [-1, 24, 116, 232, 464, 1024]
+        elif scale == 1.5:
+            stage_out_channels = [-1, 24, 176, 352, 704, 1024]
+        elif scale == 2.0:
+            stage_out_channels = [-1, 24, 224, 488, 976, 2048]
+        else:
+            raise NotImplementedError("This scale size:[" + str(scale) +
+                                      "] is not implemented!")
+        # 1. conv1
+        self._conv1 = ConvBNLayer(
+            in_channels=3,
+            out_channels=stage_out_channels[1],
+            kernel_size=3,
+            stride=2,
+            padding=1,
+            act=act)
+        self._max_pool = MaxPool2D(kernel_size=3, stride=2, padding=1)
+
+        # 2. bottleneck sequences
+        self._block_list = []
+        for stage_id, num_repeat in enumerate(stage_repeats):
+            for i in range(num_repeat):
+                if i == 0:
+                    block = self.add_sublayer(
+                        sublayer=InvertedResidualDS(
+                            in_channels=stage_out_channels[stage_id + 1],
+                            out_channels=stage_out_channels[stage_id + 2],
+                            stride=2,
+                            act=act),
+                        name=str(stage_id + 2) + "_" + str(i + 1))
+                else:
+                    block = self.add_sublayer(
+                        sublayer=InvertedResidual(
+                            in_channels=stage_out_channels[stage_id + 2],
+                            out_channels=stage_out_channels[stage_id + 2],
+                            stride=1,
+                            act=act),
+                        name=str(stage_id + 2) + "_" + str(i + 1))
+                self._block_list.append(block)
+        # 3. last_conv
+        self._last_conv = ConvBNLayer(
+            in_channels=stage_out_channels[-2],
+            out_channels=stage_out_channels[-1],
+            kernel_size=1,
+            stride=1,
+            padding=0,
+            act=act)
+        # 4. pool
+        if with_pool:
+            self._pool2d_avg = AdaptiveAvgPool2D(1)
+
+        # 5. fc
+        if num_classes > 0:
+            self._out_c = stage_out_channels[-1]
+            self._fc = Linear(stage_out_channels[-1], num_classes)
+
+    def forward(self, inputs):
+        x = self._conv1(inputs)
+        x = self._max_pool(x)
+        for inv in self._block_list:
+            x = inv(x)
+        x = self._last_conv(x)
+
+        if self.with_pool:
+            x = self._pool2d_avg(x)
+
+        if self.num_classes > 0:
+            x = paddle.flatten(x, start_axis=1, stop_axis=-1)
+            x = self._fc(x)
+        return x
+
+
+def _shufflenet_v2(arch, pretrained=False, **kwargs):
+    model = ShuffleNetV2(**kwargs)
+    if pretrained:
+        assert (
+            arch in model_urls
+        ), "{} model do not have a pretrained model now, you should set pretrained=False".format(
+            arch)
+        weight_path = get_weights_path_from_url(model_urls[arch][0],
+                                                model_urls[arch][1])
+
+        param = paddle.load(weight_path)
+        model.set_dict(param)
+    return model
+
+
+def shufflenet_v2_x0_25(pretrained=False, **kwargs):
+    """ShuffleNetV2 with 0.25x output channels, as described in
+    `"ShuffleNet V2: Practical Guidelines for Ecient CNN Architecture Design" <https://arxiv.org/pdf/1807.11164.pdf>`_ 。
+
+    Args:
+        pretrained (bool): If True, returns a model pre-trained on ImageNet. Default: False.
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            from paddle.vision.models import shufflenet_v2_x0_25
+
+            # build model
+            model = shufflenet_v2_x0_25()
+
+            # build model and load imagenet pretrained weight
+            # model = shufflenet_v2_x0_25(pretrained=True)
+
+            x = paddle.rand([1, 3, 224, 224])
+            out = model(x)
+
+            print(out.shape)
+
+    """
+    return _shufflenet_v2(
+        "shufflenet_v2_x0_25", scale=0.25, pretrained=pretrained, **kwargs)
+
+
+def shufflenet_v2_x0_33(pretrained=False, **kwargs):
+    """ShuffleNetV2 with 0.33x output channels, as described in
+    `"ShuffleNet V2: Practical Guidelines for Ecient CNN Architecture Design" <https://arxiv.org/pdf/1807.11164.pdf>`_ 。
+
+    Args:
+        pretrained (bool): If True, returns a model pre-trained on ImageNet. Default: False.
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            from paddle.vision.models import shufflenet_v2_x0_33
+
+            # build model
+            model = shufflenet_v2_x0_33()
+
+            # build model and load imagenet pretrained weight
+            # model = shufflenet_v2_x0_33(pretrained=True)
+
+            x = paddle.rand([1, 3, 224, 224])
+            out = model(x)
+
+            print(out.shape)
+
+    """
+    return _shufflenet_v2(
+        "shufflenet_v2_x0_33", scale=0.33, pretrained=pretrained, **kwargs)
+
+
+def shufflenet_v2_x0_5(pretrained=False, **kwargs):
+    """ShuffleNetV2 with 0.5x output channels, as described in
+    `"ShuffleNet V2: Practical Guidelines for Ecient CNN Architecture Design" <https://arxiv.org/pdf/1807.11164.pdf>`_ 。
+
+    Args:
+        pretrained (bool): If True, returns a model pre-trained on ImageNet. Default: False.
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            from paddle.vision.models import shufflenet_v2_x0_5
+
+            # build model
+            model = shufflenet_v2_x0_5()
+
+            # build model and load imagenet pretrained weight
+            # model = shufflenet_v2_x0_5(pretrained=True)
+
+            x = paddle.rand([1, 3, 224, 224])
+            out = model(x)
+
+            print(out.shape)
+
+    """
+    return _shufflenet_v2(
+        "shufflenet_v2_x0_5", scale=0.5, pretrained=pretrained, **kwargs)
+
+
+def shufflenet_v2_x1_0(pretrained=False, **kwargs):
+    """ShuffleNetV2 with 1.0x output channels, as described in
+    `"ShuffleNet V2: Practical Guidelines for Ecient CNN Architecture Design" <https://arxiv.org/pdf/1807.11164.pdf>`_ 。
+
+    Args:
+        pretrained (bool): If True, returns a model pre-trained on ImageNet. Default: False.
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            from paddle.vision.models import shufflenet_v2_x1_0
+
+            # build model
+            model = shufflenet_v2_x1_0()
+
+            # build model and load imagenet pretrained weight
+            # model = shufflenet_v2_x1_0(pretrained=True)
+
+            x = paddle.rand([1, 3, 224, 224])
+            out = model(x)
+
+            print(out.shape)
+
+    """
+    return _shufflenet_v2(
+        "shufflenet_v2_x1_0", scale=1.0, pretrained=pretrained, **kwargs)
+
+
+def shufflenet_v2_x1_5(pretrained=False, **kwargs):
+    """ShuffleNetV2 with 1.5x output channels, as described in
+    `"ShuffleNet V2: Practical Guidelines for Ecient CNN Architecture Design" <https://arxiv.org/pdf/1807.11164.pdf>`_ 。
+
+    Args:
+        pretrained (bool): If True, returns a model pre-trained on ImageNet. Default: False.
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            from paddle.vision.models import shufflenet_v2_x1_5
+
+            # build model
+            model = shufflenet_v2_x1_5()
+
+            # build model and load imagenet pretrained weight
+            # model = shufflenet_v2_x1_5(pretrained=True)
+
+            x = paddle.rand([1, 3, 224, 224])
+            out = model(x)
+
+            print(out.shape)
+
+    """
+    return _shufflenet_v2(
+        "shufflenet_v2_x1_5", scale=1.5, pretrained=pretrained, **kwargs)
+
+
+def shufflenet_v2_x2_0(pretrained=False, **kwargs):
+    """ShuffleNetV2 with 2.0x output channels, as described in
+    `"ShuffleNet V2: Practical Guidelines for Ecient CNN Architecture Design" <https://arxiv.org/pdf/1807.11164.pdf>`_ 。
+
+    Args:
+        pretrained (bool): If True, returns a model pre-trained on ImageNet. Default: False.
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            from paddle.vision.models import shufflenet_v2_x2_0
+
+            # build model
+            model = shufflenet_v2_x2_0()
+
+            # build model and load imagenet pretrained weight
+            # model = shufflenet_v2_x2_0(pretrained=True)
+
+            x = paddle.rand([1, 3, 224, 224])
+            out = model(x)
+
+            print(out.shape)
+
+    """
+    return _shufflenet_v2(
+        "shufflenet_v2_x2_0", scale=2.0, pretrained=pretrained, **kwargs)
+
+
+def shufflenet_v2_swish(pretrained=False, **kwargs):
+    """ShuffleNetV2 with 1.0x output channels and swish activation function, as described in
+    `"ShuffleNet V2: Practical Guidelines for Ecient CNN Architecture Design" <https://arxiv.org/pdf/1807.11164.pdf>`_ 。
+
+    Args:
+        pretrained (bool): If True, returns a model pre-trained on ImageNet. Default: False.
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            from paddle.vision.models import shufflenet_v2_swish
+
+            # build model
+            model = shufflenet_v2_swish()
+
+            # build model and load imagenet pretrained weight
+            # model = shufflenet_v2_swish(pretrained=True)
+
+            x = paddle.rand([1, 3, 224, 224])
+            out = model(x)
+
+            print(out.shape)
+
+    """
+    return _shufflenet_v2(
+        "shufflenet_v2_swish",
+        scale=1.0,
+        act="swish",
+        pretrained=pretrained,
+        **kwargs)
diff --git a/python/paddle/vision/models/squeezenet.py b/python/paddle/vision/models/squeezenet.py
new file mode 100644
index 0000000000000..804be2622cfec
--- /dev/null
+++ b/python/paddle/vision/models/squeezenet.py
@@ -0,0 +1,240 @@
+# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+
+from paddle.nn import Conv2D, Dropout
+from paddle.nn import AdaptiveAvgPool2D, MaxPool2D
+from paddle.fluid.param_attr import ParamAttr
+from paddle.utils.download import get_weights_path_from_url
+
+__all__ = []
+
+model_urls = {
+    'squeezenet1_0':
+    ('https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/SqueezeNet1_0_pretrained.pdparams',
+     '30b95af60a2178f03cf9b66cd77e1db1'),
+    'squeezenet1_1':
+    ('https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/SqueezeNet1_1_pretrained.pdparams',
+     'a11250d3a1f91d7131fd095ebbf09eee'),
+}
+
+
+class MakeFireConv(nn.Layer):
+    def __init__(self, input_channels, output_channels, filter_size, padding=0):
+        super(MakeFireConv, self).__init__()
+        self._conv = Conv2D(
+            input_channels,
+            output_channels,
+            filter_size,
+            padding=padding,
+            weight_attr=ParamAttr(),
+            bias_attr=ParamAttr())
+
+    def forward(self, x):
+        x = self._conv(x)
+        x = F.relu(x)
+        return x
+
+
+class MakeFire(nn.Layer):
+    def __init__(self, input_channels, squeeze_channels, expand1x1_channels,
+                 expand3x3_channels):
+        super(MakeFire, self).__init__()
+        self._conv = MakeFireConv(input_channels, squeeze_channels, 1)
+        self._conv_path1 = MakeFireConv(squeeze_channels, expand1x1_channels, 1)
+        self._conv_path2 = MakeFireConv(
+            squeeze_channels, expand3x3_channels, 3, padding=1)
+
+    def forward(self, inputs):
+        x = self._conv(inputs)
+        x1 = self._conv_path1(x)
+        x2 = self._conv_path2(x)
+        return paddle.concat([x1, x2], axis=1)
+
+
+class SqueezeNet(nn.Layer):
+    """SqueezeNet model from
+    `"SqueezeNet: AlexNet-level accuracy with 50x fewer parameters and <0.5MB model size"
+    <https://arxiv.org/pdf/1602.07360.pdf>`_
+
+    Args:
+        version (str): version of squeezenet, which can be "1.0" or "1.1".
+        num_classes (int): output dim of last fc layer. Default: 1000.
+        with_pool (bool): use pool before the last fc layer or not. Default: True.
+
+    Examples:
+        .. code-block:: python
+            import paddle
+            from paddle.vision.models import SqueezeNet
+
+            # build v1.0 model
+            model = SqueezeNet(version='1.0')
+
+            # build v1.1 model
+            # model = SqueezeNet(version='1.1')
+
+            x = paddle.rand([1, 3, 224, 224])
+            out = model(x)
+
+            print(out.shape)
+
+    """
+
+    def __init__(self, version, num_classes=1000, with_pool=True):
+        super(SqueezeNet, self).__init__()
+        self.version = version
+        self.num_classes = num_classes
+        self.with_pool = with_pool
+
+        supported_versions = ['1.0', '1.1']
+        assert version in supported_versions, \
+            "supported versions are {} but input version is {}".format(
+                supported_versions, version)
+
+        if self.version == "1.0":
+            self._conv = Conv2D(
+                3,
+                96,
+                7,
+                stride=2,
+                weight_attr=ParamAttr(),
+                bias_attr=ParamAttr())
+            self._pool = MaxPool2D(kernel_size=3, stride=2, padding=0)
+            self._conv1 = MakeFire(96, 16, 64, 64)
+            self._conv2 = MakeFire(128, 16, 64, 64)
+            self._conv3 = MakeFire(128, 32, 128, 128)
+            self._conv4 = MakeFire(256, 32, 128, 128)
+            self._conv5 = MakeFire(256, 48, 192, 192)
+            self._conv6 = MakeFire(384, 48, 192, 192)
+            self._conv7 = MakeFire(384, 64, 256, 256)
+            self._conv8 = MakeFire(512, 64, 256, 256)
+        else:
+            self._conv = Conv2D(
+                3,
+                64,
+                3,
+                stride=2,
+                padding=1,
+                weight_attr=ParamAttr(),
+                bias_attr=ParamAttr())
+            self._pool = MaxPool2D(kernel_size=3, stride=2, padding=0)
+            self._conv1 = MakeFire(64, 16, 64, 64)
+            self._conv2 = MakeFire(128, 16, 64, 64)
+            self._conv3 = MakeFire(128, 32, 128, 128)
+            self._conv4 = MakeFire(256, 32, 128, 128)
+            self._conv5 = MakeFire(256, 48, 192, 192)
+            self._conv6 = MakeFire(384, 48, 192, 192)
+            self._conv7 = MakeFire(384, 64, 256, 256)
+            self._conv8 = MakeFire(512, 64, 256, 256)
+
+        self._drop = Dropout(p=0.5, mode="downscale_in_infer")
+        self._conv9 = Conv2D(
+            512, num_classes, 1, weight_attr=ParamAttr(), bias_attr=ParamAttr())
+        self._avg_pool = AdaptiveAvgPool2D(1)
+
+    def forward(self, inputs):
+        x = self._conv(inputs)
+        x = F.relu(x)
+        x = self._pool(x)
+        if self.version == "1.0":
+            x = self._conv1(x)
+            x = self._conv2(x)
+            x = self._conv3(x)
+            x = self._pool(x)
+            x = self._conv4(x)
+            x = self._conv5(x)
+            x = self._conv6(x)
+            x = self._conv7(x)
+            x = self._pool(x)
+            x = self._conv8(x)
+        else:
+            x = self._conv1(x)
+            x = self._conv2(x)
+            x = self._pool(x)
+            x = self._conv3(x)
+            x = self._conv4(x)
+            x = self._pool(x)
+            x = self._conv5(x)
+            x = self._conv6(x)
+            x = self._conv7(x)
+            x = self._conv8(x)
+        if self.num_classes > 0:
+            x = self._drop(x)
+            x = self._conv9(x)
+        if self.with_pool:
+            x = F.relu(x)
+            x = self._avg_pool(x)
+            x = paddle.squeeze(x, axis=[2, 3])
+
+        return x
+
+
+def _squeezenet(arch, version, pretrained, **kwargs):
+    model = SqueezeNet(version, **kwargs)
+    if pretrained:
+        assert arch in model_urls, "{} model do not have a pretrained model now, you should set pretrained=False".format(
+            arch)
+        weight_path = get_weights_path_from_url(model_urls[arch][0],
+                                                model_urls[arch][1])
+        param = paddle.load(weight_path)
+        model.set_dict(param)
+
+    return model
+
+
+def squeezenet1_0(pretrained=False, **kwargs):
+    """SqueezeNet v1.0 model
+
+    Args:
+        pretrained (bool): If True, returns a model pre-trained on ImageNet. Default: False.
+
+    Examples:
+        .. code-block:: python
+
+            from paddle.vision.models import squeezenet1_0
+
+            # build model
+            model = squeezenet1_0()
+
+            # build model and load imagenet pretrained weight
+            # model = squeezenet1_0(pretrained=True)
+    """
+    return _squeezenet('squeezenet1_0', '1.0', pretrained, **kwargs)
+
+
+def squeezenet1_1(pretrained=False, **kwargs):
+    """SqueezeNet v1.1 model
+
+    Args:
+        pretrained (bool): If True, returns a model pre-trained on ImageNet. Default: False.
+
+    Examples:
+        .. code-block:: python
+
+            from paddle.vision.models import squeezenet1_1
+
+            # build model
+            model = squeezenet1_1()
+
+            # build model and load imagenet pretrained weight
+            # model = squeezenet1_1(pretrained=True)
+    """
+    return _squeezenet('squeezenet1_1', '1.1', pretrained, **kwargs)
diff --git a/python/setup.py.in b/python/setup.py.in
index b10d5df541f2f..6a252a5723d0d 100644
--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -54,6 +54,25 @@ def get_minor():
 def get_patch():
     return str(_get_version_detail(2))
 
+def get_cuda_version():
+    if '@WITH_GPU@' == 'ON':
+        return '@CUDA_VERSION@'
+    else:
+        return 'False'
+
+def get_cudnn_version():
+    if '@WITH_GPU@' == 'ON':
+        temp_cudnn_version = ''
+        if '@CUDNN_MAJOR_VERSION@':
+            temp_cudnn_version += '@CUDNN_MAJOR_VERSION@'
+            if '@CUDNN_MINOR_VERSION@':
+                temp_cudnn_version += '.@CUDNN_MINOR_VERSION@'
+                if '@CUDNN_PATCHLEVEL_VERSION@':
+                    temp_cudnn_version += '.@CUDNN_PATCHLEVEL_VERSION@'
+        return temp_cudnn_version
+    else:
+        return 'False'
+
 def is_taged():
     try:
         cmd = ['git', 'describe', '--exact-match', '--tags', 'HEAD', '2>/dev/null']
@@ -67,7 +86,7 @@ def is_taged():
     else:
         return False
 
-def write_version_py(filename='paddle/version.py'):
+def write_version_py(filename='paddle/version/__init__.py'):
     cnt = '''# THIS FILE IS GENERATED FROM PADDLEPADDLE SETUP.PY
 #
 full_version    = '%(major)d.%(minor)d.%(patch)s'
@@ -75,11 +94,56 @@ major           = '%(major)d'
 minor           = '%(minor)d'
 patch           = '%(patch)s'
 rc              = '%(rc)d'
+cuda_version    = '%(cuda)s'
+cudnn_version   = '%(cudnn)s'
 istaged         = %(istaged)s
 commit          = '%(commit)s'
 with_mkl        = '%(with_mkl)s'
 
+__all__ = ['cuda', 'cudnn', 'show']
+
 def show():
+    """Get the version of paddle if `paddle` package if tagged. Otherwise, output the corresponding commit id.
+    
+    Returns:
+        If paddle package is not tagged, the commit-id of paddle will be output.
+        Otherwise, the following information will be output.
+
+        full_version: version of paddle
+
+        major: the major version of paddle
+
+        minor: the minor version of paddle
+
+        patch: the patch level version of paddle
+        
+        rc: whether it's rc version
+
+        cuda: the cuda version of package. It will return `False` if CPU version paddle package is installed
+
+        cudnn: the cudnn version of package. It will return `False` if CPU version paddle package is installed
+    
+    Examples:
+        .. code-block:: python
+
+            import paddle
+
+            # Case 1: paddle is tagged with 2.2.0
+            paddle.version.show()
+            # full_version: 2.2.0
+            # major: 2
+            # minor: 2
+            # patch: 0
+            # rc: 0
+            # cuda: '10.2'
+            # cudnn: '7.6.5'
+
+            # Case 2: paddle is not tagged
+            paddle.version.show()
+            # commit: cfa357e984bfd2ffa16820e354020529df434f7d
+            # cuda: '10.2'
+            # cudnn: '7.6.5'
+    """
     if istaged:
         print('full_version:', full_version)
         print('major:', major)
@@ -88,11 +152,56 @@ def show():
         print('rc:', rc)
     else:
         print('commit:', commit)
+    print('cuda:', cuda_version)
+    print('cudnn:', cudnn_version)
 
 def mkl():
     return with_mkl
+
+def cuda():
+    """Get cuda version of paddle package.
+
+    Returns:
+        string: Return the version information of cuda. If paddle package is CPU version, it will return False.
+    
+    Examples:
+        .. code-block:: python
+
+            import paddle
+
+            paddle.version.cuda()
+            # '10.2'
+
+    """
+    return cuda_version
+
+def cudnn():
+    """Get cudnn version of paddle package.
+
+    Returns:
+        string: Return the version information of cudnn. If paddle package is CPU version, it will return False.
+    
+    Examples:
+        .. code-block:: python
+
+            import paddle
+
+            paddle.version.cudnn()
+            # '7.6.5'
+
+    """
+    return cudnn_version
 '''
     commit = git_commit()
+
+    dirname = os.path.dirname(filename)
+
+    try:
+        os.makedirs(dirname)
+    except OSError as e:
+        if e.errno != errno.EEXIST:
+            raise
+
     with open(filename, 'w') as f:
         f.write(cnt % {
             'major': get_major(),
@@ -100,11 +209,13 @@ def mkl():
             'patch': get_patch(),
             'rc': RC,
             'version': '${PADDLE_VERSION}',
+            'cuda': get_cuda_version(),
+            'cudnn': get_cudnn_version(),
             'commit': commit,
             'istaged': is_taged(),
             'with_mkl': '@WITH_MKL@'})
 
-write_version_py(filename='@PADDLE_BINARY_DIR@/python/paddle/version.py')
+write_version_py(filename='@PADDLE_BINARY_DIR@/python/paddle/version/__init__.py')
 
 def write_cuda_env_config_py(filename='paddle/cuda_env.py'):
     cnt = ""
@@ -163,6 +274,7 @@ packages=['paddle',
           'paddle.incubate.checkpoint',
           'paddle.incubate.operators',
           'paddle.incubate.tensor',
+          'paddle.incubate.nn',
           'paddle.distributed.fleet',
           'paddle.distributed.fleet.base',
           'paddle.distributed.fleet.elastic',
@@ -222,6 +334,7 @@ packages=['paddle',
           'paddle.fluid.incubate.fleet.collective',
           'paddle.fluid.incubate.fleet.utils',
           'paddle.amp',
+          'paddle.cost_model',
           'paddle.hapi',
           'paddle.vision',
           'paddle.vision.models',
@@ -230,6 +343,9 @@ packages=['paddle',
           'paddle.text',
           'paddle.text.datasets',
           'paddle.incubate',
+          'paddle.incubate.nn',
+          'paddle.incubate.nn.functional',
+          'paddle.incubate.nn.layer',
           'paddle.io',
           'paddle.optimizer',
           'paddle.nn',
@@ -242,11 +358,13 @@ packages=['paddle',
           'paddle.static',
           'paddle.static.nn',
           'paddle.static.amp',
+          'paddle.static.sparsity',
           'paddle.tensor',
           'paddle.onnx',
           'paddle.autograd',
           'paddle.device',
           'paddle.device.cuda',
+          'paddle.version',
           ]
 
 with open('@PADDLE_SOURCE_DIR@/python/requirements.txt') as f:
@@ -287,6 +405,7 @@ if os.name != 'nt':
 else:
     package_data={'paddle.fluid': ['${FLUID_CORE_NAME}' + '.pyd', '${FLUID_CORE_NAME}' + '.lib']}
 
+package_data['paddle.fluid'] += ['${PADDLE_BINARY_DIR}/python/paddle/cost_model/static_op_benchmark.json']
 if '${HAS_NOAVX_CORE}' == 'ON':
     package_data['paddle.fluid'] += ['core_noavx' + ('.so' if os.name != 'nt' else '.pyd')]
 
@@ -345,6 +464,10 @@ if '${WITH_LITE}' == 'ON':
             shutil.copy('${LITE_NNADAPTER_NPU_LIB}', libs_path)
             package_data['paddle.libs']+=['libnnadapter_driver_huawei_ascend_npu' + ext_name]
 
+if '${WITH_CINN}' == 'ON':
+    shutil.copy('${CINN_LIB_LOCATION}/${CINN_LIB_NAME}', libs_path)
+    package_data['paddle.libs']+=['libcinnapi.so']
+
 if '${WITH_PSLIB}' == 'ON':
     shutil.copy('${PSLIB_LIB}', libs_path)
     if os.path.exists('${PSLIB_VERSION_PY}'):
diff --git a/python/unittest_py/requirements.txt b/python/unittest_py/requirements.txt
index af2203316d8b3..fe8382faa0c34 100644
--- a/python/unittest_py/requirements.txt
+++ b/python/unittest_py/requirements.txt
@@ -3,6 +3,7 @@ coverage
 pycrypto ; platform_system != "Windows"
 mock
 gym
+hypothesis
 opencv-python<=4.2.0.32
 visualdl
 paddle2onnx>=0.8.2
diff --git a/tools/check_api_approvals.sh b/tools/check_api_approvals.sh
index 760bc2b168475..dcbe853d8a1bc 100644
--- a/tools/check_api_approvals.sh
+++ b/tools/check_api_approvals.sh
@@ -43,22 +43,22 @@ api_spec_diff=`python ${PADDLE_ROOT}/tools/diff_api.py ${PADDLE_ROOT}/paddle/flu
 if [ "$api_spec_diff" != "" -o "${api_params_diff}" != "" ]; then
     echo_line="You must have one RD (XiaoguangHu01, lanxianghit or Superjomn) approval for API change.\n"
     echo_line="${echo_line} and one TPM approval for API change: \n"
-    echo_line="${echo_line} jzhang533/ZhangJun, dingjiaweiww/DingJiaWei, Heeenrrry/LiKunLun, TCChenlong/ChenLong for general APIs\n"
-    echo_line="${echo_line} PangHua/XiangHui for distributed related APIs\n"
-    echo_line="${echo_line} twismon/WangYunKai, CheQiXiao/CheQiXiao for inference related APIs.\n"
+    echo_line="${echo_line} jzhang533/ZhangJun, dingjiaweiww/DingJiaWei, TCChenlong/ChenLong, Ligoml/LiMengLiu for general APIs.\n"
+    echo_line="${echo_line} PangHua/XiangHui for distributed related APIs.\n"
+    echo_line="${echo_line} leiqing1/LeiQing for inference related APIs.\n"
 
     check_approval 1 46782768 47554610 328693
-    check_approval 1 29231 23093488 28379894 11935832 2682285 12050047 50894398
+    check_approval 1 29231 23093488 11935832 39876205 2682285 54695910
 fi
 
 api_doc_spec_diff=`python ${PADDLE_ROOT}/tools/diff_api.py ${PADDLE_ROOT}/paddle/fluid/API_DEV.spec.doc  ${PADDLE_ROOT}/paddle/fluid/API_PR.spec.doc` 
 if [ "$api_doc_spec_diff" != "" ]; then
     echo_line="You must have  one TPM approval for API documents change: \n"
-    echo_line="${echo_line} jzhang533/ZhangJun, dingjiaweiww/DingJiaWei, Heeenrrry/LiKunLun, TCChenlong/ChenLong for general API docs\n"
-    echo_line="${echo_line} PangHua/XiangHui for distributed related API docs\n"
-    echo_line="${echo_line} twismon/WangYunKai, CheQiXiao/CheQiXiao for inference related API docs.\n"
+    echo_line="${echo_line} jzhang533/ZhangJun, dingjiaweiww/DingJiaWei, TCChenlong/ChenLong, Ligoml/LiMengLiu for general API docs.\n"
+    echo_line="${echo_line} PangHua/XiangHui for distributed related API docs.\n"
+    echo_line="${echo_line} leiqing1/LeiQing for inference related API docs.\n"
 
-    check_approval 1 29231 23093488 28379894 11935832 2682285 12050047 50894398
+    check_approval 1 29231 23093488 11935832 39876205 2682285 54695910
 fi
 
 api_src_spec_diff=`python ${PADDLE_ROOT}/tools/check_api_source_without_core_ops.py ${PADDLE_ROOT}/paddle/fluid/API_DEV.source.md5  ${PADDLE_ROOT}/paddle/fluid/API_PR.source.md5` 
diff --git a/tools/check_file_diff_approvals.sh b/tools/check_file_diff_approvals.sh
index 6104b168798c9..efe492dffc8e2 100644
--- a/tools/check_file_diff_approvals.sh
+++ b/tools/check_file_diff_approvals.sh
@@ -366,9 +366,9 @@ if [ -n "${UNITYBUILD_RULE_CHANGED}" -a -n "${GIT_PR_ID}" ]; then
                wanghuancoder or luotao1) approval for modifying
                unity_build_rule.cmake which the rules of Unity Build."
     echo_line=$(echo ${echo_line})
-    # Avin0323(16167147) zhouwei25(52485244)
+    # Avin0323(23427135) zhouwei25(52485244)
     # wanghuancoder(26922892) luotao1(6836917)
-    check_approval 1 16167147 52485244 26922892 6836917
+    check_approval 1 23427135 52485244 26922892 6836917
 fi
 
 if [ -n "${echo_list}" ];then
diff --git a/tools/ci_model_benchmark.sh b/tools/ci_model_benchmark.sh
index 574169869376a..ec1515f7d76dc 100644
--- a/tools/ci_model_benchmark.sh
+++ b/tools/ci_model_benchmark.sh
@@ -98,6 +98,7 @@ case $1 in
   ;;
   run_all)
     compile_install_paddle
+    init_benchmark
     prepare_data
     run_model_benchmark
   ;;
diff --git a/tools/dockerfile/Dockerfile.cuda10_ubuntu18_cinn b/tools/dockerfile/Dockerfile.cuda10_ubuntu18_cinn
deleted file mode 100644
index c021c23aec82b..0000000000000
--- a/tools/dockerfile/Dockerfile.cuda10_ubuntu18_cinn
+++ /dev/null
@@ -1,152 +0,0 @@
-# A image for building paddle binaries
-# Use cuda devel base image for both cpu and gpu environment
-# When you modify it, please be aware of cudnn-runtime version
-FROM nvidia/cuda:10.1-cudnn7-devel-ubuntu18.04
-MAINTAINER PaddlePaddle Authors <paddle-dev@baidu.com>
-
-# ENV variables
-ARG WITH_GPU
-ARG WITH_AVX
-
-ENV WITH_GPU=${WITH_GPU:-ON}
-ENV WITH_AVX=${WITH_AVX:-ON}
-ENV DEBIAN_FRONTEND=noninteractive
-
-ENV HOME /root
-# Add bash enhancements
-COPY paddle/scripts/docker/root/ /root/
-
-RUN apt-get update && \
-  apt-get install -y software-properties-common && add-apt-repository ppa:deadsnakes/ppa && \
-  apt-get update && \
-  apt-get install -y curl wget vim git unzip unrar tar xz-utils bzip2 gzip \ 
-    coreutils ntp language-pack-zh-hans python-qt4 libsm6 libxext6 libxrender-dev
-
-
-# Downgrade gcc&&g++
-WORKDIR /usr/bin 
-      RUN apt-get update --fix-missing
-      COPY tools/dockerfile/build_scripts /build_scripts 
-      RUN bash /build_scripts/install_gcc.sh gcc82 && rm -rf /build_scripts 
-      RUN cp gcc gcc.bak && cp g++ g++.bak && rm gcc && rm g++ 
-      RUN ln -s /usr/local/gcc-8.2/bin/gcc /usr/local/bin/gcc 
-      RUN ln -s /usr/local/gcc-8.2/bin/g++ /usr/local/bin/g++ 
-      RUN ln -s /usr/local/gcc-8.2/bin/gcc /usr/bin/gcc 
-      RUN ln -s /usr/local/gcc-8.2/bin/g++ /usr/bin/g++ 
-      ENV PATH=/usr/local/gcc-8.2/bin:$PATH 
-
-RUN apt-get update && \
-  apt-get install -y python2.7 python2.7-dev \
-  python3.5 python3.5-dev \
-  python3.6 python3.6-dev \
-  python3.7 python3.7-dev \
-  python3.8 python3.8-dev && \
-  curl https://bootstrap.pypa.io/2.7/get-pip.py -o - | python2.7 && easy_install pip && \
-  curl https://bootstrap.pypa.io/3.5/get-pip.py -o - | python3.5 && easy_install pip && \
-  curl https://bootstrap.pypa.io/ez_setup.py -o - | python3.6 && easy_install pip && \
-  curl https://bootstrap.pypa.io/ez_setup.py -o - | python3.7 && easy_install pip && \
-  curl https://bootstrap.pypa.io/ez_setup.py -o - | python3.8 && easy_install pip && \
-  rm /usr/bin/python && ln -s /usr/bin/python2.7 /usr/bin/python && \
-  rm /usr/bin/python3 && ln -s /usr/bin/python3.5 /usr/bin/python3 && \
-  rm /usr/local/bin/pip && ln -s /usr/local/bin/pip2.7 /usr/local/bin/pip && \
-  rm /usr/local/bin/pip3 && ln -s /usr/local/bin/pip3.5 /usr/local/bin/pip3
-
-
-# install cmake
-WORKDIR /home
-RUN wget -q https://cmake.org/files/v3.16/cmake-3.16.0-Linux-x86_64.tar.gz && tar -zxvf cmake-3.16.0-Linux-x86_64.tar.gz && rm cmake-3.16.0-Linux-x86_64.tar.gz
-ENV PATH=/home/cmake-3.16.0-Linux-x86_64/bin:$PATH
-
-
-# remove them when apt-get support 2.27 and higher version
-RUN wget -q https://ftp.gnu.org/gnu/binutils/binutils-2.33.1.tar.gz && \ 
-    tar -xzf binutils-2.33.1.tar.gz && \ 
-    cd binutils-2.33.1 && \
-    ./configure && make -j && make install && cd .. && rm -rf binutils-2.33.1 binutils-2.33.1.tar.gz
-
-
-# Install Go and glide
-RUN wget -qO- https://paddle-ci.cdn.bcebos.com/go1.8.1.linux-amd64.tar.gz | \
-    tar -xz -C /usr/local && \
-    mkdir /root/gopath && \
-    mkdir /root/gopath/bin && \
-    mkdir /root/gopath/src
-ENV GOROOT=/usr/local/go GOPATH=/root/gopath
-# should not be in the same line with GOROOT definition, otherwise docker build could not find GOROOT.
-ENV PATH=${PATH}:${GOROOT}/bin:${GOPATH}/bin
-# install glide
-RUN curl -s -q https://glide.sh/get | sh
-
-# git credential to skip password typing
-RUN git config --global credential.helper store
-
-# Fix locales to en_US.UTF-8
-RUN localedef -i en_US -f UTF-8 en_US.UTF-8
-
-RUN pip3 --no-cache-dir install pre-commit==1.10.4 ipython==5.3.0 && \
-    pip3 --no-cache-dir install ipykernel==4.6.0 wheel && \
-    pip3.6 --no-cache-dir install pre-commit==1.10.4 ipython==5.3.0 && \
-    pip3.6 --no-cache-dir install ipykernel==4.6.0 wheel && \
-    pip3.7 --no-cache-dir install pre-commit==1.10.4 ipython==5.3.0 && \
-    pip3.7 --no-cache-dir install ipykernel==4.6.0 wheel && \
-    pip3.8 --no-cache-dir install pre-commit==1.10.4 ipython==5.3.0 && \
-    pip3.8 --no-cache-dir install ipykernel==4.6.0 wheel && \
-    pip --no-cache-dir install pre-commit==1.10.4 ipython==5.3.0 && \
-    pip --no-cache-dir install ipykernel==4.6.0 wheel 
-
-#For docstring checker
-RUN pip3 --no-cache-dir install pylint pytest astroid isort && \
-    pip3.6 --no-cache-dir install pylint pytest astroid isort && \
-    pip3.7 --no-cache-dir install pylint pytest astroid isort && \
-    pip3.8 --no-cache-dir install pylint pytest astroid isort && \
-    pip --no-cache-dir install pylint pytest astroid isort
-
-COPY ./python/requirements.txt /root/
-RUN pip3 --no-cache-dir install -r /root/requirements.txt && \
-    pip3.6 --no-cache-dir install -r /root/requirements.txt && \
-    pip3.7 --no-cache-dir install -r /root/requirements.txt && \
-    pip3.8 --no-cache-dir install -r /root/requirements.txt && \
-    pip --no-cache-dir install -r /root/requirements.txt
-
-
-# Older versions of patchelf limited the size of the files being processed and were fixed in this pr.
-# https://github.com/NixOS/patchelf/commit/ba2695a8110abbc8cc6baf0eea819922ee5007fa
-# So install a newer version here.
-RUN wget -q https://paddle-ci.cdn.bcebos.com/patchelf_0.10-2_amd64.deb && \
-    dpkg -i patchelf_0.10-2_amd64.deb
-
-# Configure OpenSSH server. c.f. https://docs.docker.com/engine/examples/running_ssh_service
-#RUN mkdir /var/run/sshd && echo 'root:root' | chpasswd && sed -ri 's/^PermitRootLogin\s+.*/PermitRootLogin yes/' /etc/ssh/sshd_config && sed -ri 's/UsePAM yes/#UsePAM yes/g' /etc/ssh/sshd_config
-#CMD source ~/.bashrc
-
-# ccache 3.7.9
-RUN wget https://paddle-ci.gz.bcebos.com/ccache-3.7.9.tar.gz && \
-    tar xf ccache-3.7.9.tar.gz && mkdir /usr/local/ccache-3.7.9 && cd ccache-3.7.9 && \
-    ./configure -prefix=/usr/local/ccache-3.7.9 && \
-    make -j8 && make install && \
-    ln -s /usr/local/ccache-3.7.9/bin/ccache /usr/local/bin/ccache
-
-# For CINN environment 
-RUN apt update --fix-missing
-RUN apt-get install autoconf autogen
-RUN apt-get install libtool
-RUN apt-get install zlib1g-dev
-RUN apt install libginac-dev -y
-RUN apt install clang cmake -y
-RUN python3 -m pip install numpy
-RUN python3 -m pip install pybind11
-
-
-# Install LLVM
-RUN echo "deb http://apt.llvm.org/bionic/ llvm-toolchain-bionic main" >> /etc/apt/source.list
-RUN echo "deb-src http://apt.llvm.org/bionic/ llvm-toolchain-bionic main" >> /etc/apt/source.list
-RUN echo "deb http://apt.llvm.org/bionic/ llvm-toolchain-bionic-10 main" >> /etc/apt/source.list
-RUN echo "deb-src http://apt.llvm.org/bionic/ llvm-toolchain-bionic-10 main" >> /etc/apt/source.list
-RUN ln -s /usr/bin/llvm-config-6.0 /usr/bin/llvm-config
-RUN wget -O - https://apt.llvm.org/llvm-snapshot.gpg.key|apt-key add -
-
-RUN apt update
-RUN apt install libclang-dev llvm-10 llvm-10-dev libclang-10-dev -y
-
-
-EXPOSE 22
diff --git a/tools/dockerfile/ci_dockerfile.sh b/tools/dockerfile/ci_dockerfile.sh
index e61a4eb3dbd0c..bb7bdfe46c29b 100644
--- a/tools/dockerfile/ci_dockerfile.sh
+++ b/tools/dockerfile/ci_dockerfile.sh
@@ -59,10 +59,27 @@ function make_centos_dockerfile(){
   sed -i "s#RUN bash build_scripts/build.sh#RUN bash build_scripts/install_gcc.sh gcc54 \nRUN mv /usr/bin/cc /usr/bin/cc.bak \&\& ln -s /usr/local/gcc-5.4/bin/gcc /usr/bin/cc \nENV PATH=/usr/local/gcc-5.4/bin:\$PATH \nRUN bash build_scripts/build.sh#g" ${dockerfile_name}
 }
 
+function make_cinn_dockerfile(){
+  dockerfile_name="Dockerfile.cuda11_cudnn8_gcc82_ubuntu18_cinn"
+  sed "s/<baseimg>/11.2.0-cudnn8-devel-ubuntu18.04/g" ./Dockerfile.ubuntu18 >${dockerfile_name}
+  sed -i "7i ENV TZ=Asia/Beijing" ${dockerfile_name}
+  sed -i "8i RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone" ${dockerfile_name}
+  sed -i "9i RUN apt-get update && apt-get install -y liblzma-dev openmpi-bin openmpi-doc libopenmpi-dev" ${dockerfile_name}
+  dockerfile_line=$(wc -l ${dockerfile_name}|awk '{print $1}')
+  sed -i "${dockerfile_line}i RUN wget --no-check-certificate -q https://paddle-edl.bj.bcebos.com/hadoop-2.7.7.tar.gz \&\& \
+     tar -xzf  hadoop-2.7.7.tar.gz && mv hadoop-2.7.7 /usr/local/" ${dockerfile_name}
+  sed -i "${dockerfile_line}i RUN apt remove git -y \&\& apt install -y libcurl4-openssl-dev gettext \&\& wget -q https://paddle-ci.gz.bcebos.com/git-2.17.1.tar.gz \&\& \
+    tar -xvf git-2.17.1.tar.gz \&\& \
+    cd git-2.17.1 \&\& \
+    ./configure --with-openssl --with-curl --prefix=/usr/local \&\& \
+    make -j8 \&\& make install " ${dockerfile_name}
+  sed -i "${dockerfile_line}i RUN pip install wheel \&\& pip3 install PyGithub wheel \&\& pip3.7 install PyGithub " ${dockerfile_name}
+}
 
 function main() {
   make_ubuntu_dockerfile
   make_centos_dockerfile
+  make_cinn_dockerfile
 }
 
 main "$@"
diff --git a/tools/parallel_UT_rule.py b/tools/parallel_UT_rule.py
index 803e173e071f6..02d6396f9cec2 100644
--- a/tools/parallel_UT_rule.py
+++ b/tools/parallel_UT_rule.py
@@ -417,11 +417,12 @@
     'test_fusion_group_op', 'test_imperative_layer_apply',
     'test_executor_return_tensor_not_overwriting',
     'test_optimizer_in_control_flow', 'test_lookup_table_op', 'test_randint_op',
-    'test_convert_call', 'test_sigmoid_cross_entropy_with_logits_op',
-    'copy_cross_scope_test', 'test_normalization_wrapper',
-    'test_pretrained_model', 'test_flip', 'test_cosine_similarity_api',
-    'test_cumsum_op', 'test_range', 'test_log_loss_op', 'test_where_index',
-    'test_tril_triu_op', 'test_lod_reset_op', 'test_lod_tensor', 'test_addmm_op',
+    'test_randint_like', 'test_convert_call',
+    'test_sigmoid_cross_entropy_with_logits_op', 'copy_cross_scope_test',
+    'test_normalization_wrapper', 'test_pretrained_model', 'test_flip',
+    'test_cosine_similarity_api', 'test_cumsum_op', 'test_range',
+    'test_log_loss_op', 'test_where_index', 'test_tril_triu_op',
+    'test_lod_reset_op', 'test_lod_tensor', 'test_addmm_op',
     'test_index_select_op', 'test_nvprof', 'test_index_sample_op',
     'test_unstack_op', 'test_increment', 'strided_memcpy_test',
     'test_target_assign_op', 'test_trt_dynamic_shape_transformer_prune',
diff --git a/tools/static_mode_white_list.py b/tools/static_mode_white_list.py
index 7d0a2a8953fc8..8705e29cbb220 100644
--- a/tools/static_mode_white_list.py
+++ b/tools/static_mode_white_list.py
@@ -610,6 +610,7 @@
     'test_dequantize_mkldnn_op',
     'test_elementwise_add_mkldnn_op',
     'test_elementwise_add_bf16_mkldnn_op',
+    'test_elementwise_div_mkldnn_op',
     'test_elementwise_sub_mkldnn_op',
     'test_elementwise_mul_mkldnn_op',
     'test_elementwise_mul_bf16_mkldnn_op',
diff --git a/tools/test_ci_op_benchmark.sh b/tools/test_ci_op_benchmark.sh
index 23df51f09c8e6..25e613dd6bcd0 100644
--- a/tools/test_ci_op_benchmark.sh
+++ b/tools/test_ci_op_benchmark.sh
@@ -317,11 +317,11 @@ function gpu_op_benchmark {
 }
 
 # The PR will pass quickly when get approval from specific person.
-# Xreki 12538138, luotao1 6836917, Avin0323 16167147
+# Xreki 12538138, luotao1 6836917, Avin0323 23427135
 set +x
 approval_line=$(curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000)
 if [ -n "${approval_line}" ]; then
-  APPROVALS=$(echo ${approval_line} | python ${PADDLE_ROOT}/tools/check_pr_approval.py 1 16167147 12538138 6836917)
+  APPROVALS=$(echo ${approval_line} | python ${PADDLE_ROOT}/tools/check_pr_approval.py 1 23427135 12538138 6836917)
   LOG "[INFO] current pr ${GIT_PR_ID} got approvals: ${APPROVALS}"
   if [ "${APPROVALS}" == "TRUE" ]; then
     LOG "[INFO] ==================================="
diff --git a/tools/windows/run_unittests.sh b/tools/windows/run_unittests.sh
index 88c8ba3dab9f6..d961bd9a7159f 100644
--- a/tools/windows/run_unittests.sh
+++ b/tools/windows/run_unittests.sh
@@ -354,7 +354,7 @@ if [ "${WITH_GPU:-OFF}" == "ON" ];then
             exit 8;
         fi
     fi
-    run_unittest_gpu $cpu_parallel_job 12
+    run_unittest_gpu $cpu_parallel_job 10
     run_unittest_gpu $tetrad_parallel_job 4
     run_unittest_gpu $two_parallel_job 2
     run_unittest_gpu $non_parallel_job